import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.mixture import GaussianMixture
import boto3
import chart_studio
import chart_studio.plotly as py
C:\Users\jcf\AppData\Local\Programs\Python\Python311\Lib\site-packages\sentence_transformers\cross_encoder\CrossEncoder.py:11: TqdmExperimentalWarning: Using `tqdm.autonotebook.tqdm` in notebook mode. Use `tqdm.tqdm` instead to force console mode (e.g. in jupyter console) from tqdm.autonotebook import tqdm, trange
public data: https://www.kaggle.com/datasets/maharshipandya/-spotify-tracks-dataset
personal data: spotify API
The objective of this analysis is to get Spotify data for a particular user's library, cluster the genres in 3 different ways, and compare the results. As the genre of a song is a partially subjective topic, conclusions could vary from person to person.
There are two datasets: one from the user, got using the Spotify API to retrieve the user's library tracks; the other is a public dataset containing 1.000.000 tracks with mostly the same information as the user data.
User and public data contain information of tracks, including title, artist, duration, and some features used internally in Spotify and assigned to each track, such as danceability, acousticness, etc. User data contains some extra features, such as number of sections, tempo changes, and some more.
The meaning of the features can be seen in the API documentation: https://developer.spotify.com/documentation/web-api/reference/get-track
def read_full_table(table_name):
    """Return every item of a DynamoDB table as a pandas DataFrame.

    The table is scanned repeatedly: as long as a response contains
    ``LastEvaluatedKey``, another scan is issued starting from that key and
    its items are appended to the ones already collected.

    Args:
        table_name: Name of the DynamoDB table to read.

    Returns:
        A DataFrame built from the accumulated ``Items`` of all responses.
    """
    aws_session = boto3.Session(profile_name="default")
    table = aws_session.resource("dynamodb", region_name="eu-west-1").Table(table_name)

    page = table.scan()
    items = page["Items"]
    while "LastEvaluatedKey" in page:
        page = table.scan(ExclusiveStartKey=page["LastEvaluatedKey"])
        items += page["Items"]
    return pd.DataFrame(items)
# Columns of the user table that are cast to numeric during preprocessing.
numerical_cols = [
    "num_sections",
    "danceability",
    "sections_avg_duration",
    "instrumentalness",
    "liveness",
    "loudness",
    "duration",
    "speechiness",
    "valence",
    "dynamics_changes",
    "tempo_changes",
    "acousticness",
    "time_signature_changes",
    "popularity",
    "mode_changes",
    "energy",
    "key_changes",
    "tempo",
]
# NOTE(review): presumably the numeric columns available in the public dataset.
numerical_cols_public = [
    "danceability",
    "instrumentalness",
    "liveness",
    "loudness",
    "duration",
    "speechiness",
    "valence",
    "acousticness",
    "popularity",
    "energy",
    "tempo",
]
# NOTE(review): presumably the numeric columns that still need scaling - confirm.
non_standarized_cols = [
    "num_sections",
    "sections_avg_duration",
    "loudness",
    "duration",
    "dynamics_changes",
    "tempo_changes",
    "time_signature_changes",
    "popularity",
    "mode_changes",
    "key_changes",
    "tempo",
]
categorical_cols = ["key", "mode"]

# Lookup tables used to replace the integer "key" / "mode" codes with labels.
# Index i maps to the i-th note name; -1 maps to "NoKey".
notes = ("C", "C#", "D", "Eb", "E", "F", "F#", "G", "Ab", "A", "Bb", "B")
key_mapping = {**dict(enumerate(notes)), -1: "NoKey"}
mode_mapping = {0: "Minor", 1: "Major"}

# Seed shared by the estimators so that runs are repeatable.
random_state = 602_452
# Read the decrypted Chart Studio API key from SSM and store it in the
# Chart Studio credentials file.
ssm = boto3.client("ssm", region_name="eu-west-1")
chart_studio_api_key = ssm.get_parameter(
    Name="CHART_STUDIO_API_KEY", WithDecryption=True
)["Parameter"]["Value"]
chart_studio.tools.set_credentials_file(username="jcf94", api_key=chart_studio_api_key)
# Load both datasets and work on copies, leaving the *_raw frames untouched.
track_info_raw = read_full_table("track_info")
public_data_raw = pd.read_csv(r"C:\Users\jcf\Desktop\codigo\Portfolio\Spotify Analysis\public_music_data.csv")
track_info, public_data = track_info_raw.copy(), public_data_raw.copy()

# Replace the integer key / mode codes with readable labels in both frames.
for frame in (track_info, public_data):
    frame["key"] = frame["key"].map(key_mapping)
    frame["mode"] = frame["mode"].map(mode_mapping)

# User data: cast the feature columns to numeric dtypes.
for feature in numerical_cols:
    track_info[feature] = pd.to_numeric(track_info[feature])

# User data: express every "*changes*" column relative to the track duration.
change_cols = [name for name in track_info.columns if "changes" in name]
for name in change_cols:
    track_info[name] = track_info[name] / track_info["duration"]
track_info["case"] = "user"

# Public data: derive "duration" from the millisecond column.
public_data["duration"] = public_data["duration_ms"] / 1000
public_data["case"] = "public"
def _genre_shares(frame, genre_col, artist_col):
    """Count unique tracks and artists per genre and add percentage columns.

    The percentages are relative to the sum of the per-genre unique counts,
    not to the number of distinct tracks or artists in ``frame``.
    """
    summary = frame.loc[:, [genre_col, "track_id", artist_col]]
    summary = summary.groupby([genre_col]).nunique().reset_index()
    summary["track_perc"] = 100 * summary["track_id"] / summary["track_id"].sum()
    summary["artist_perc"] = 100 * summary[artist_col] / summary[artist_col].sum()
    return summary


# User tracks carry a list of genres, so each list is exploded to one row per
# genre before counting; the public data has a single "track_genre" column.
genre_info = _genre_shares(track_info.explode("genres"), "genres", "artist")
genre_info_public = _genre_shares(public_data, "track_genre", "artists")
# Print the genre entries of five randomly drawn tracks.
for track_genres in track_info["genres"].sample(5):
    print(track_genres)
['album rock', 'art rock', 'classic rock', 'hard rock', 'mellow gold', 'progressive rock', 'rock', 'soft rock', 'symphonic rock'] ['chamber pop', 'indie rock', 'jam band', 'louisville indie', 'melancholia', 'roots rock', 'stomp and holler'] ['djent', 'melodic metalcore', 'progressive metal', 'progressive metalcore'] ['instrumental rock'] ['alternative metal', 'hard rock', 'melodic thrash', 'metal', 'old school thrash', 'rock', 'speed metal', 'thrash metal']
A sample of track genres is shown. As can be seen, a song can have multiple genres.
First, an Exploratory Data Analysis is done, to see how many genres are in each dataset and in which quantity.
# Pareto-style view of the user's genres: share per genre as bars, running
# total as a line on the secondary y axis.
limit = 90
genre_info_plot = genre_info.sort_values(by="track_perc", ascending=False)
genre_info_plot["track_perc_accum"] = genre_info_plot["track_perc"].cumsum()

# Count all genres and those whose cumulative share is still within `limit`.
total_elements = genre_info_plot["genres"].nunique()
within_limit = genre_info_plot["track_perc_accum"] <= limit
top_elements = genre_info_plot.loc[within_limit, "genres"].nunique()
print(f"{total_elements=}, top {limit}% elements={top_elements}")

accum_trace = go.Scatter(
    x=genre_info_plot["genres"],
    y=genre_info_plot["track_perc_accum"],
    name="% Accum",
)
share_trace = go.Bar(
    x=genre_info_plot["genres"],
    y=genre_info_plot["track_perc"],
    name="%",
)

fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(accum_trace, secondary_y=True)
fig.add_trace(share_trace, secondary_y=False)
fig.update_layout(title_text="User Track Genres")
fig.show()
py.plot(fig, filename="user_genres_distribution", auto_open=False)
total_elements=241, top 90% elements=83
'https://plotly.com/~jcf94/3/'
The genres are shown for the user data, both by track and by artist (how many songs of a certain genre are in the user library, and how many artists are of a certain genre). Genre data is supported for artist in the Spotify API, and not by track, so for each track, the artist genre is used.
As can be seen, this user listens mainly to rock and metal, but also to some funk, jazz and fusion genres, as well as more alternative ones.
There are over 240 different genres, and only 83 of them account for 90% of the total genres in the dataset. For the rest of the analysis, only these 83 genres are taken into account, as they represent the majority of the dataset, and a high number of groups can lead to problems and difficulty in the clustering (most groups are underrepresented, so the results are very mixed groups).
A comparison of the genres in the user data and public data is shown, to see if they are similar or very different. First, the genres in the user data that are not in the public database are shown, as well as the number of them, absolute and in percentage.
Around 95% of the genres in the user data are not in the public database, and around 90% of the genres in the public database are not in the user data, so both databases are vastly different in terms of genres. This could create some problems in ML, as the public data does not represent the user data, and any algorithm built from one database might not be applicable to the other.
# Compare the genre vocabularies of the two datasets: which genres appear in
# only one of them, how many there are, and what percentage of that dataset's
# unique genres they make up.
public_genres = genre_info_public["track_genre"].unique()
user_genres = genre_info["genres"].unique()

user_genres_not_public = set(user_genres).difference(public_genres)
# NOTE: the misspelled name ("pulic") is kept on purpose; it is a module-level
# name that other cells may reference, and it is echoed by the f-string below.
pulic_genres_not_user = set(public_genres).difference(user_genres)

print(f"{user_genres_not_public=}\n({len(user_genres_not_public)}, {100 * len(user_genres_not_public) / len(user_genres)}%)")
# The percent sign now sits inside the parenthesis, matching the line above
# (it used to be printed as "(104, 91.2...)%").
print(f"{pulic_genres_not_user=}\n({len(pulic_genres_not_user)}, {100 * len(pulic_genres_not_user) / len(public_genres)}%)")
user_genres_not_public={'post-hardcore', 'british math rock', 'jazztronica', 'conscious hip hop', 'instrumental math rock', 'stoner rock', 'microtonal', 'breakcore', 'blackgaze', 'groove metal', 'chamber pop', 'slacker rock', 'modern rock', 'greek psychedelic rock', 'french shoegaze', 'flute rock', 'avant-garde metal', 'instrumental rock', 'contemporary jazz', 'roots rock', 'britpop', 'experimental indie rock', 'boston rock', 'hard rock', 'folk rock', 'rare groove', 'merseybeat', 'comedy rock', 'acid rock', 'djent', 'atmospheric post-metal', 'nwobhm', 'taiwan indie', 'soft rock', 'progressive rock', 'israeli metal', 'synth prog', 'san diego indie', 'shimmer pop', 'brazilian progressive metal', 'psychedelic soul', 'post-black metal', 'trip hop', 'classic texas country', 'rock drums', 'permanent wave', 'birmingham metal', 'ann arbor indie', 'dance-punk', 'progressive jazz fusion', 'melodic metalcore', 'political hip hop', 'swedish doom metal', 'doom metal', 'drill and bass', 'british jazz', 'krautrock', 'late romantic era', 'southern soul', 'yacht rock', 'palm desert scene', 'jazz metal', 'atmospheric black metal', 'blues rock', 'norwegian prog', 'progressive death metal', 'italian progressive rock', 'jazz fusion', 'experimental rock', 'art rock', 'funk metal', 'garage rock', 'indietronica', 'mellow gold', 'progressive sludge', 'instrumental bluegrass', 'piano rock', 'zolo', 'australian metal', 'instrumental post-rock', 'new wave', 'chapman stick', 'space rock', 'uk doom metal', 'instrumental stoner rock', 'experimental', 'proto-metal', 'north carolina metal', 'noise rock', 'intelligent dance music', 'funktronica', 'sci-fi metal', 'indie jazz', 'industrial metal', 'industrial rock', 'spacegrunge', 'jazz piano', 'nu metal', 'cybergrind', 'swedish metal', 'psychedelic rock', 'funk rock', 'experimental pop', 'german rock', 'speed metal', 'video game music', 'double drumming', 'laboratorio', 'modern alternative rock', 'opera metal', 'melbourne punk', 'progressive metal', 
'alternative rock', 'afrofuturism', 'electric bass', 'post-punk', 'neo classical metal', 'american post-rock', 'dance pop', 'noise pop', 'british invasion', 'french metal', 'glam rock', 'album rock', 'prog metal', 'dance rock', 'acoustic rock', 'nu gaze', 'british blues', 'melodic thrash', 'old school thrash', 'uk post-punk', 'taiwan post-rock', 'art pop', 'swedish progressive metal', 'madchester', 'bebop', 'indie rock', 'technical thrash', 'japanese vgm', 'rap metal', 'modern jazz trio', 'comic', 'parody', 'scottish rock', 'instrumental funk', 'jazz rock', 'cosmic american', 'electric blues', 'german metal', 'math rock', 'contemporary post-bop', 'japanese math rock', 'french black metal', 'cascadia psych', 'otacore', 'glam metal', 'gothic metal', 'progressive metalcore', 'psychobilly', 'symphonic rock', 'shoegaze', 'southeast asian post-rock', 'texas metal', 'crank wave', 'japanese post-rock', 'abstract', 'country rock', 'metal guitar', 'classic japanese jazz', 'stomp and holler', 'progressive groove metal', 'post-grunge', 'shred', 'melancholia', 'electronic djent', 'italian baroque', 'canadian metal', 'beatlesque', 'dream pop', 'motown', 'progressive bluegrass', 'thrash metal', 'alternative dance', 'technical groove metal', 'dark pop', 'jam band', 'electronic rock', 'french death metal', 'jazz funk', 'atlanta metal', 'classic rock', 'sillycore', 'no wave', 'polish prog', 'post-rock', 'stoner metal', 'comic metal', 'synth funk', 'sludge metal', 'louisville indie', 'el paso indie', 'australian psych', 'canterbury scene', 'supergroup', 'oxford indie', 'sacramento indie', 'classic soul', 'modern hard rock', 'rap rock', 'metal cearense', 'modern progressive rock', 'post-metal', 'german stoner rock', 'electronica', 'german hard rock', 'mexican classic rock', 'classic canadian rock', 'atmospheric sludge', 'neo-psychedelic', 'cyberpunk', 'baroque', 'emotional black metal', 'art punk', 'indie catala', 'modern blues rock', 'anti-folk', 'p funk', 'instrumental djent', 
'alternative metal', 'midwest emo'}
(231, 95.850622406639%)
pulic_genres_not_user={'chicago-house', 'cantopop', 'children', 'house', 'guitar', 'alternative', 'afrobeat', 'hardstyle', 'songwriter', 'punk-rock', 'chill', 'latin', 'detroit-techno', 'ska', 'black-metal', 'folk', 'sertanejo', 'techno', 'heavy-metal', 'grindcore', 'party', 'samba', 'j-dance', 'gospel', 'malay', 'honky-tonk', 'bluegrass', 'trip-hop', 'k-pop', 'pop-film', 'mandopop', 'psych-rock', 'sleep', 'reggae', 'british', 'dub', 'mpb', 'happy', 'reggaeton', 'blues', 'indie', 'comedy', 'progressive-house', 'goth', 'indian', 'groove', 'study', 'synth-pop', 'r-n-b', 'brazil', 'show-tunes', 'country', 'death-metal', 'piano', 'drum-and-bass', 'kids', 'power-pop', 'latino', 'j-pop', 'hard-rock', 'j-rock', 'salsa', 'world-music', 'swedish', 'opera', 'garage', 'trance', 'sad', 'electronic', 'spanish', 'hip-hop', 'acoustic', 'new-age', 'electro', 'tango', 'dubstep', 'alt-rock', 'rock-n-roll', 'idm', 'club', 'dance', 'hardcore', 'punk', 'dancehall', 'rockabilly', 'french', 'pop', 'ambient', 'disco', 'anime', 'metalcore', 'forro', 'indie-pop', 'edm', 'deep-house', 'romance', 'iranian', 'turkish', 'breakbeat', 'disney', 'pagode', 'german', 'j-idol', 'minimal-techno'}
(104, 91.2280701754386)%
In this section, instead of using the features of each track, the genres are grouped using only the semantic similarity between their names. This could group similar genres that share common words (such as a "main" genre and its subgenres: rock, alternative rock, progressive rock, funk rock, etc.), but will probably fail to cluster genres that share common traits but no semantic similarity (for example, alt rock and indie, or djent and death metal).
A SentenceTransformer is used with a pre-trained model to convert the string into numerical features, and then a KMeans model is used for clustering.
All unique genres are clustered into 10 groups, which must be manually checked to see whether they make sense.
A PCA is also applied to draw a simple 2D plot in order to see whether the groups make sense, although it's hard to see multidimensional information reduced to only 2 dimensions, so some information is lost when plotting.
# Keep only the most frequent genres: those whose cumulative track share,
# in descending order of share, is still within `limit` percent.
genre_info_sorted = genre_info.sort_values(by="track_perc", ascending=False)
genre_info_sorted["track_perc_accum"] = genre_info_sorted["track_perc"].cumsum()
limit = 90
genre_info_sorted = genre_info_sorted.loc[genre_info_sorted["track_perc_accum"] <= limit, "genres"].unique()
genres = genre_info_sorted.tolist()

# Embed every genre name with a pre-trained sentence model and cluster the
# embeddings with KMeans.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
embeddings = model.encode(genres)
num_clusters = 10
kmeans = KMeans(n_clusters=num_clusters, random_state=random_state)
kmeans.fit(embeddings)
# Labels are stringified; they serve as dict keys below and as the colour
# values of the scatter plot.
labels = [str(e) for e in kmeans.labels_]

# Print the genres grouped by cluster
clusters = {}
for genre, label in zip(genres, labels):
    clusters.setdefault(label, []).append(genre)
clusters = dict(sorted(clusters.items()))
for cluster_id, genre_list in clusters.items():
    print(f"Cluster {cluster_id}: {', '.join(genre_list)}\n")

# Project the embeddings onto their first two principal components for a 2D view.
pca = PCA(n_components=2)
reduced_embeddings = pca.fit_transform(embeddings)
# The coordinates must be passed by keyword: the first positional parameter of
# px.scatter is `data_frame`, so passing the two arrays positionally made the
# first component the data frame and the second component `x`, and the figure
# did not show component 1 against component 2.
fig = px.scatter(x=reduced_embeddings[:, 0], y=reduced_embeddings[:, 1], color=labels)
fig.show()
py.plot(fig, filename="PCA_semantic_clustering", auto_open=False)
C:\Users\jcf\AppData\Local\Programs\Python\Python311\Lib\site-packages\huggingface_hub\file_download.py:1132: FutureWarning:
`resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
C:\Users\jcf\AppData\Local\Programs\Python\Python311\Lib\site-packages\joblib\externals\loky\backend\context.py:136: UserWarning:
Could not find the number of physical cores for the following reason:
found 0 physical cores < 1
Returning the number of logical cores instead. You can silence this warning by setting LOKY_MAX_CPU_COUNT to the number of cores you want to use.
File "C:\Users\jcf\AppData\Local\Programs\Python\Python311\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
raise ValueError(f"found {cpu_count_physical} physical cores < 1")
Cluster 0: nu metal Cluster 1: funk rock, funk metal, dance pop, instrumental funk, jam band, conscious hip hop, funk, political hip hop, rap rock, p funk Cluster 2: alternative metal, progressive metal, metal, groove metal, progressive groove metal, french metal, french death metal, rap metal, swedish metal, speed metal, canadian metal, swedish progressive metal, stoner metal, technical groove metal, progressive death metal Cluster 3: progressive rock, classic rock, album rock, symphonic rock, instrumental rock, psychedelic rock, el paso indie, classic canadian rock, electric bass, indie rock, instrumental math rock, blues rock, singer-songwriter Cluster 4: grunge, post-grunge, melancholia, oxford indie, microtonal, australian psych, neo-psychedelic, mellow gold, zolo, supergroup, palm desert scene, uk post-punk, shoegaze, noise pop, trip hop Cluster 5: progressive jazz fusion, jazz, double drumming, contemporary jazz, jazz rock, jazz fusion, jazz funk, sacramento indie, jazz metal Cluster 6: old school thrash, thrash metal, melodic thrash, technical thrash Cluster 7: rock, alternative rock, art rock, hard rock, modern rock, garage rock, soft rock, stoner rock, math rock, glam rock, post-rock, acid rock Cluster 8: djent, instrumental djent Cluster 9: permanent wave, new wave
'https://plotly.com/~jcf94/5/'
Cluster 0 contains only 2 genres that are not related at all (one being more classic 70-80s soft rock, and the other more modern, dark metal). However, seeing the other clusters, this seems more like an "other" group, where the genres included are the ones that did not fit well with the others.
Cluster 1 seems to have rhythmic genres, mostly funk and related, but also some "jam" genres (where there is room for improvisation), such as jam band and blues.
Cluster 2 includes thrash metal and some subgenres of it.
Cluster 3 is rock and several subgenres.
Cluster 4 comprises more grunge and "weird" music, such as microtonal and new wave. These genres tend to be more experimental.
Cluster 5 is jazz and variants.
Cluster 6 is more "spoken" music, mostly hip hop.
Cluster 7 is more indie music and variants.
Cluster 8 is mostly metal and subgenres of it.
Finally, cluster 9 comprises genres with emphasis on repetition and low-frequency, rhythmic sounds.
It seems the clustering is sensible, as most of the grouped genres share traits and could be grouped together if the objective is to reduce the number of genres (we went from 83 unique genres to 10).
In order to check whether these groups make sense in the real data, we assign the clusters to each genre, and a sample of 10 songs is drawn randomly from each cluster. If the grouping is sensible, these tracks should really have common traits. It should be taken into account that each track can have several genres, so a track can be in more than one cluster. As this multiple-genre trait is not quantified (it's not known "how much of a genre" a certain song has), this could be problematic, as a track can be clustered in several groups but not all of them are "correct". How "correct" a song's assignment is is also subjective.
def assign_cluster(x):
    """Return the keys of every cluster that shares a genre with ``x``.

    Args:
        x: Iterable of genre names attached to a single track.

    Returns:
        list: Keys of the module-level ``clusters`` mapping (cluster key ->
        iterable of genre names) whose genres overlap with ``x``, in the
        mapping's iteration order. Empty when no cluster overlaps.
    """
    # Build the track's genre set once instead of once per cluster.
    track_genres = set(x)
    return [
        cluster_key
        for cluster_key, cluster_genres in clusters.items()
        if not track_genres.isdisjoint(cluster_genres)
    ]
# Tag each track with the list of NLP clusters matched by its genres.
track_info["cluster_NLP"] = track_info["genres"].apply(assign_cluster)

# One row per (track, cluster) pair: a track matching several clusters is repeated.
df_exploded = track_info.explode("cluster_NLP")

# Persist the features together with the cluster label and identifying columns.
_export_cols = [
    *categorical_cols,
    *numerical_cols,
    "cluster_NLP",
    "genres",
    "track_name",
    "artist",
]
df_exploded_csv = df_exploded[_export_cols]
df_exploded_csv.to_csv("df_exploded_clustering.csv")
# Print up to 10 randomly drawn tracks per NLP cluster for a manual sanity check.
_sample_cols = ["track_name", "artist", "genres", "track_url"]
for cluster_idx in range(num_clusters):
    # The cluster label is compared against the string form of the index.
    members = df_exploded.loc[df_exploded["cluster_NLP"] == str(cluster_idx), _sample_cols]
    print(f"Cluster {cluster_idx}")
    # Never request more rows than the cluster actually holds.
    print(members.sample(min(10, len(members))).values)
Cluster 0 [['To Defy The Laws Of Tradition' 'Primus' list(['alternative metal', 'alternative rock', 'funk metal', 'funk rock', 'grunge', 'nu metal']) 'https://open.spotify.com/track/5bVSGG70polmbENWS4j6Uy'] ["Mind's Mirrors / In Death - Is Life / In Death - Is Death - Live" 'Meshuggah' list(['alternative metal', 'djent', 'groove metal', 'metal', 'nu metal', 'progressive groove metal', 'swedish metal', 'technical groove metal', 'technical thrash']) 'https://open.spotify.com/track/7snx3JnD5PxvEJ7SZeArA8'] ['The Shooting Star' 'Gojira' list(['alternative metal', 'french death metal', 'french metal', 'groove metal', 'metal', 'nu metal', 'progressive groove metal']) 'https://open.spotify.com/track/6HQfFAupOMsmfWV4CbG1Kj'] ['The Fall' 'Gojira' list(['alternative metal', 'french death metal', 'french metal', 'groove metal', 'metal', 'nu metal', 'progressive groove metal']) 'https://open.spotify.com/track/3ONrh7cP8vezCw1Q3UJNOn'] ['Over The Electric Grapevine' 'Primus' list(['alternative metal', 'alternative rock', 'funk metal', 'funk rock', 'grunge', 'nu metal']) 'https://open.spotify.com/track/42mybYyPzY5S6btIgjMmD6'] ['Liquid Fire' 'Gojira' list(['alternative metal', 'french death metal', 'french metal', 'groove metal', 'metal', 'nu metal', 'progressive groove metal']) 'https://open.spotify.com/track/4M8nCBcSypR8Nanmor2tNQ'] ['Stinkfist' 'TOOL' list(['alternative metal', 'art rock', 'nu metal', 'post-grunge', 'progressive metal', 'progressive rock', 'rock']) 'https://open.spotify.com/track/0pwObEOHolQZSldJ2q1wpy'] ['The Heaviest Matter of the Universe' 'Gojira' list(['alternative metal', 'french death metal', 'french metal', 'groove metal', 'metal', 'nu metal', 'progressive groove metal']) 'https://open.spotify.com/track/2pZsQqXFgcY03vRyZxSQhU'] ['The art of dying' 'Gojira' list(['alternative metal', 'french death metal', 'french metal', 'groove metal', 'metal', 'nu metal', 'progressive groove metal']) 'https://open.spotify.com/track/7iMQChXFK33TS49QWhE4tt'] ['The Hand 
That Feeds' 'Nine Inch Nails' list(['alternative metal', 'alternative rock', 'cyberpunk', 'electronic rock', 'grunge', 'industrial', 'industrial metal', 'industrial rock', 'nu metal', 'post-grunge', 'rock']) 'https://open.spotify.com/track/4L5r9XNiWJ7cbqcIiNvVes']] Cluster 1 [['Dynamite' 'Jamiroquai' list(['dance pop']) 'https://open.spotify.com/track/4LkJXlCD9AYjScPywhUjZs'] ['The Curtain - Live From Dordrecht, Het Energiehuis / 2014' 'Snarky Puppy' list(['contemporary jazz', 'funk rock', 'jazz', 'progressive jazz fusion']) 'https://open.spotify.com/track/29ls98FgNdZHbmqdQeF7E6'] ['Black Is the Soul' 'Korn' list(['alternative metal', 'funk metal', 'hard rock', 'nu metal', 'post-grunge', 'rap metal', 'rock']) 'https://open.spotify.com/track/1wYXIhJXqlIAGS6sbzpTL6'] ['Monarchy of Roses' 'Red Hot Chili Peppers' list(['alternative rock', 'funk metal', 'funk rock', 'permanent wave', 'rock']) 'https://open.spotify.com/track/16Bf9uR4PI2dCoXDIjs0cP'] ['Take It! (feat. Bernard Wright)' 'Snarky Puppy' list(['contemporary jazz', 'funk rock', 'jazz', 'progressive jazz fusion']) 'https://open.spotify.com/track/60XlahpurnbMWZkBZPNloO'] ["Maggie's Farm" 'Rage Against The Machine' list(['alternative metal', 'alternative rock', 'conscious hip hop', 'funk metal', 'hard rock', 'nu metal', 'political hip hop', 'post-grunge', 'rap metal', 'rap rock', 'rock']) 'https://open.spotify.com/track/2t0tVmiSkHWvKgojpjO21Z'] ['Whitecap' 'Snarky Puppy' list(['contemporary jazz', 'funk rock', 'jazz', 'progressive jazz fusion']) 'https://open.spotify.com/track/4fEdeCUuzDQwfZo3RP3Idf'] ['Out In L.A. 
- 2002 Digital Remaster' 'Red Hot Chili Peppers' list(['alternative rock', 'funk metal', 'funk rock', 'permanent wave', 'rock']) 'https://open.spotify.com/track/0ytvC9tG03obop9RXqG3af'] ['Police Station' 'Red Hot Chili Peppers' list(['alternative rock', 'funk metal', 'funk rock', 'permanent wave', 'rock']) 'https://open.spotify.com/track/6DfWymHzCYOH2ABUuHFaMe'] ["(Don't) Give Hate a Chance" 'Jamiroquai' list(['dance pop']) 'https://open.spotify.com/track/2IJk5fQC6Qiu2gRrCR2977']] Cluster 2 [['Question!' 'System Of A Down' list(['alternative metal', 'nu metal', 'rap metal', 'rock']) 'https://open.spotify.com/track/6y2DHyCYf6azhUfXmnuH6w'] ['Ticks & Leeches' 'TOOL' list(['alternative metal', 'art rock', 'nu metal', 'post-grunge', 'progressive metal', 'progressive rock', 'rock']) 'https://open.spotify.com/track/0Cnx6PGogxIE2RnDcnoeK8'] ['Absolomb' 'Periphery' list(['djent', 'melodic metalcore', 'progressive metal', 'progressive metalcore']) 'https://open.spotify.com/track/3cWMNY5QEgeFl7WOfn4gf3'] ['§1 - Radio Edit' 'Opeth' list(['alternative metal', 'metal', 'progressive death metal', 'progressive metal', 'swedish metal', 'swedish progressive metal']) 'https://open.spotify.com/track/1wkWkXCdHR4nGViHA3I9Yc'] ['Pneuma' 'TOOL' list(['alternative metal', 'art rock', 'nu metal', 'post-grunge', 'progressive metal', 'progressive rock', 'rock']) 'https://open.spotify.com/track/03sEzk1VyrUZSgyhoQR0LZ'] ['Supernaut' 'Black Sabbath' list(['album rock', 'alternative metal', 'birmingham metal', 'classic rock', 'hard rock', 'metal', 'rock', 'stoner rock', 'uk doom metal']) 'https://open.spotify.com/track/0qkm99RLAyoLBqWDc4fCF1'] ['Puzzle Box' 'Haken' list(['progressive metal']) 'https://open.spotify.com/track/4ihVN9xqrZvN0dlnrbFoxa'] ['Washington Is Next!' 
'Megadeth' list(['alternative metal', 'hard rock', 'melodic thrash', 'metal', 'old school thrash', 'rock', 'speed metal', 'thrash metal']) 'https://open.spotify.com/track/1jLBjt2naw4K787bJ16iCI'] ['Headlong Flight (with Clockwork Angels String Ensemble) / Drumbastica [Drum Solo] [Live on Clockwork Angels Tour]' 'Rush' list(['album rock', 'canadian metal', 'classic canadian rock', 'classic rock', 'hard rock', 'progressive rock', 'rock']) 'https://open.spotify.com/track/0PQdxnf0ddYp6lnHB81db8'] ['Origin - Concealing Fate, Pt. 6' 'TesseracT' list(['djent', 'progressive metal']) 'https://open.spotify.com/track/2CpRWY45BkoUzWZgrSRD0k']] Cluster 3 [['No One to Depend On' 'Santana' list(['blues rock', 'classic rock', 'mexican classic rock']) 'https://open.spotify.com/track/74lwRyZECS8PQOCYyHKje4'] ["Get 'Em Out By Friday - Digital Remastered 2008" 'Genesis' list(['album rock', 'art rock', 'classic rock', 'hard rock', 'mellow gold', 'progressive rock', 'rock', 'soft rock', 'symphonic rock']) 'https://open.spotify.com/track/3UyzuKnR2ZRMvPOYpWri2I'] ['Red Miso' 'Animals As Leaders' list(['djent', 'instrumental djent', 'instrumental rock', 'jazz metal', 'progressive jazz fusion', 'progressive metal']) 'https://open.spotify.com/track/2qJpcZnMUQNOdAPxsuhFFe'] ['Teflon' 'The Mars Volta' list(['el paso indie', 'garage rock']) 'https://open.spotify.com/track/4fBAErbYMr2Gq478FPL86D'] ['For the Damaged' 'Blonde Redhead' list(['dream pop', 'indie rock', 'nu gaze', 'shoegaze']) 'https://open.spotify.com/track/14E52ObKfWYZNWJRXcU3mu'] ['Jimmy' 'TOOL' list(['alternative metal', 'art rock', 'nu metal', 'post-grunge', 'progressive metal', 'progressive rock', 'rock']) 'https://open.spotify.com/track/09KJ3XQZ9RQ5prbSp59Wbc'] ['Cassandra Gemini: Plant A Nail In the Navel Stream' 'The Mars Volta' list(['el paso indie', 'garage rock']) 'https://open.spotify.com/track/6dFLb8SpYYlg4NxyJW7535'] ['The Musical Box - Remastered 2008' 'Genesis' list(['album rock', 'art rock', 'classic rock', 'hard 
rock', 'mellow gold', 'progressive rock', 'rock', 'soft rock', 'symphonic rock']) 'https://open.spotify.com/track/4QqQ6S3pDocGvUzF4fSVuS'] ["Babe I'm Gonna Leave You - Remaster" 'Led Zeppelin' list(['album rock', 'classic rock', 'hard rock', 'rock']) 'https://open.spotify.com/track/38KeSzb6FZYSogDXpc7xz8'] ['Break on Through (To the Other Side)' 'The Doors' list(['acid rock', 'album rock', 'classic rock', 'hard rock', 'psychedelic rock', 'rock']) 'https://open.spotify.com/track/6ToM0uwxtPKo9CMpbPGYvM']] Cluster 4 [['Got the Life' 'Korn' list(['alternative metal', 'funk metal', 'hard rock', 'nu metal', 'post-grunge', 'rap metal', 'rock']) 'https://open.spotify.com/track/6nJPHXRpKYv2yqtalEjKy5'] ['Fracture' 'King Crimson' list(['art rock', 'instrumental rock', 'jazz rock', 'progressive rock', 'psychedelic rock', 'symphonic rock', 'zolo']) 'https://open.spotify.com/track/5ipS4tCSxn0z7NL6To7umt'] ['First It Giveth' 'Queens of the Stone Age' list(['alternative metal', 'alternative rock', 'grunge', 'modern rock', 'palm desert scene', 'rock', 'stoner metal', 'stoner rock']) 'https://open.spotify.com/track/0UEMTz9APWfoJHdlXDeIzm'] ['Fake Plastic Trees' 'Radiohead' list(['alternative rock', 'art rock', 'melancholia', 'oxford indie', 'permanent wave', 'rock']) 'https://open.spotify.com/track/73CKjW3vsUXRpy3NnX4H7F'] ['Hamburger Train' 'Primus' list(['alternative metal', 'alternative rock', 'funk metal', 'funk rock', 'grunge', 'nu metal']) 'https://open.spotify.com/track/0tdWYwH8xIOoSeNoEz7QqV'] ['Microdeniselasalle (Microtonal-Southern Soul)' 'MonoNeon' list(['funktronica', 'microtonal']) 'https://open.spotify.com/track/2KeAeYoJujNqfYuX50Mrzi'] ['Lacquer Head' 'Primus' list(['alternative metal', 'alternative rock', 'funk metal', 'funk rock', 'grunge', 'nu metal']) 'https://open.spotify.com/track/6GAasbp33LAShmzwcaaFKC'] ['My Iron Lung' 'Radiohead' list(['alternative rock', 'art rock', 'melancholia', 'oxford indie', 'permanent wave', 'rock']) 
'https://open.spotify.com/track/0jyikFM0Umv0KlnrOEKtTG'] ['Pneuma' 'TOOL' list(['alternative metal', 'art rock', 'nu metal', 'post-grunge', 'progressive metal', 'progressive rock', 'rock']) 'https://open.spotify.com/track/03sEzk1VyrUZSgyhoQR0LZ'] ['Serve The Servants' 'Nirvana' list(['grunge', 'permanent wave', 'rock']) 'https://open.spotify.com/track/3w5Ekq9O8g0gGFEsT77Ydi']] Cluster 5 [['Starless' 'King Crimson' list(['art rock', 'instrumental rock', 'jazz rock', 'progressive rock', 'psychedelic rock', 'symphonic rock', 'zolo']) 'https://open.spotify.com/track/1Kt1j54YhvP39PnSQjU8H3'] ['Formentera Lady' 'King Crimson' list(['art rock', 'instrumental rock', 'jazz rock', 'progressive rock', 'psychedelic rock', 'symphonic rock', 'zolo']) 'https://open.spotify.com/track/5StX8dqW6uGKh1j9FN49yA'] ['Suda' 'Chon' list(['instrumental math rock', 'instrumental rock', 'progressive jazz fusion', 'san diego indie']) 'https://open.spotify.com/track/2FDSwTY9PHc5dS8jWzXUq0'] ['Ectogenesis' 'Animals As Leaders' list(['djent', 'instrumental djent', 'instrumental rock', 'jazz metal', 'progressive jazz fusion', 'progressive metal']) 'https://open.spotify.com/track/2iG3f8JUFgLIVddPlP1zap'] ['Flamethrower' 'King Gizzard & The Lizard Wizard' list(['australian psych', 'double drumming', 'microtonal', 'neo-psychedelic']) 'https://open.spotify.com/track/5XkNUvzFQUCBu39tWRVO15'] ['Witchcraft' 'King Gizzard & The Lizard Wizard' list(['australian psych', 'double drumming', 'microtonal', 'neo-psychedelic']) 'https://open.spotify.com/track/0mQU3EuOIqMpRg4qbXDxda'] ['Pink Maggit' 'Deftones' list(['alternative metal', 'nu metal', 'rap metal', 'rock', 'sacramento indie']) 'https://open.spotify.com/track/0FrT8a39eo6siL1yIxVGTP'] ['Whitecap' 'Snarky Puppy' list(['contemporary jazz', 'funk rock', 'jazz', 'progressive jazz fusion']) 'https://open.spotify.com/track/4fEdeCUuzDQwfZo3RP3Idf'] ['Tempting Time' 'Animals As Leaders' list(['djent', 'instrumental djent', 'instrumental rock', 'jazz metal', 
'progressive jazz fusion', 'progressive metal']) 'https://open.spotify.com/track/3TwKLIOl2hIPDE7z7rvOH3'] ['The Curtain - Live From Dordrecht, Het Energiehuis / 2014' 'Snarky Puppy' list(['contemporary jazz', 'funk rock', 'jazz', 'progressive jazz fusion']) 'https://open.spotify.com/track/29ls98FgNdZHbmqdQeF7E6']] Cluster 6 [['Never Walk Alone..A Call to Arms - 2019 - Remaster' 'Megadeth' list(['alternative metal', 'hard rock', 'melodic thrash', 'metal', 'old school thrash', 'rock', 'speed metal', 'thrash metal']) 'https://open.spotify.com/track/5IwzKBvIVsXKmAw91PAr0R'] ['I (21 Minute Track)' 'Meshuggah' list(['alternative metal', 'djent', 'groove metal', 'metal', 'nu metal', 'progressive groove metal', 'swedish metal', 'technical groove metal', 'technical thrash']) 'https://open.spotify.com/track/6mnLcQF5GYlFycGu2htkvb'] ['Rust In Peace...Polaris - 2004 Remix' 'Megadeth' list(['alternative metal', 'hard rock', 'melodic thrash', 'metal', 'old school thrash', 'rock', 'speed metal', 'thrash metal']) 'https://open.spotify.com/track/0803SWqmIJGvZ15B8zsewn'] ['Lucretia - 2004 Remix' 'Megadeth' list(['alternative metal', 'hard rock', 'melodic thrash', 'metal', 'old school thrash', 'rock', 'speed metal', 'thrash metal']) 'https://open.spotify.com/track/1Bi6Me45Xiqy4kYUGGwBTv'] ['Domination' 'Pantera' list(['alternative metal', 'groove metal', 'hard rock', 'metal', 'nu metal', 'old school thrash', 'rock', 'texas metal']) 'https://open.spotify.com/track/769cLRTw2y6KRdkFWFkxtu'] ['Combustion' 'Meshuggah' list(['alternative metal', 'djent', 'groove metal', 'metal', 'nu metal', 'progressive groove metal', 'swedish metal', 'technical groove metal', 'technical thrash']) 'https://open.spotify.com/track/67FLsDJIE22Rc6lWUIPPsD'] ['Poisonous Shadows' 'Megadeth' list(['alternative metal', 'hard rock', 'melodic thrash', 'metal', 'old school thrash', 'rock', 'speed metal', 'thrash metal']) 'https://open.spotify.com/track/1RdDBpGJDIsJTBou1QsJ9B'] ['Cowboys from Hell' 'Pantera' 
list(['alternative metal', 'groove metal', 'hard rock', 'metal', 'nu metal', 'old school thrash', 'rock', 'texas metal']) 'https://open.spotify.com/track/2SgbR6ttzoNlCRGQOKjrop'] ['Lethargica' 'Meshuggah' list(['alternative metal', 'djent', 'groove metal', 'metal', 'nu metal', 'progressive groove metal', 'swedish metal', 'technical groove metal', 'technical thrash']) 'https://open.spotify.com/track/4FvuNTv7dcQtoByEePExgW'] ['Nebulous' 'Meshuggah' list(['alternative metal', 'djent', 'groove metal', 'metal', 'nu metal', 'progressive groove metal', 'swedish metal', 'technical groove metal', 'technical thrash']) 'https://open.spotify.com/track/0nZJnJjDyKKpb38BC2EF4d']] Cluster 7 [['Matador' 'Brontide' list(['british math rock', 'math rock']) 'https://open.spotify.com/track/2CJ8rfT4LMRDUW1TvOMU75'] ['Youth Against Fascism' 'Sonic Youth' list(['alternative rock', 'noise pop', 'noise rock', 'rock', 'shoegaze']) 'https://open.spotify.com/track/7ewAyUnxyKXFBAm4HDgaeN'] ['The Pot' 'TOOL' list(['alternative metal', 'art rock', 'nu metal', 'post-grunge', 'progressive metal', 'progressive rock', 'rock']) 'https://open.spotify.com/track/1lATXTBJDHwawvT1UfxWu3'] ['Wake Up Dead - 2004 Remaster' 'Megadeth' list(['alternative metal', 'hard rock', 'melodic thrash', 'metal', 'old school thrash', 'rock', 'speed metal', 'thrash metal']) 'https://open.spotify.com/track/1I3qfFMraXE0kAPtRERpok'] ['Like a Stone' 'Audioslave' list(['alternative metal', 'alternative rock', 'grunge', 'hard rock', 'nu metal', 'permanent wave', 'post-grunge', 'rock', 'supergroup']) 'https://open.spotify.com/track/3YuaBvuZqcwN3CEAyyoaei'] ['Butterflies and Hurricanes - Live from Wembley Stadium' 'Muse' list(['alternative rock', 'modern rock', 'permanent wave', 'rock']) 'https://open.spotify.com/track/0vT4dGgToozNPUsEdVmeRF'] ['Mr. 
Brightside' 'The Killers' list(['alternative rock', 'dance rock', 'modern rock', 'permanent wave', 'rock']) 'https://open.spotify.com/track/3n3Ppam7vgaVa1iaRUc9Lp'] ['Peter Gunn (Live 1977)' 'Emerson, Lake & Palmer' list(['album rock', 'art rock', 'blues rock', 'classic rock', 'folk rock', 'hard rock', 'mellow gold', 'progressive rock', 'singer-songwriter', 'soft rock', 'symphonic rock', 'synth prog']) 'https://open.spotify.com/track/1XvnKIRg3wxDkN6MaW8KZC'] ['The Czar: Usurper / Escape / Martyr / Spiral' 'Mastodon' list(['alternative metal', 'atlanta metal', 'metal', 'progressive groove metal', 'progressive sludge', 'sludge metal', 'stoner metal', 'stoner rock']) 'https://open.spotify.com/track/2LMjQnKH7sQzOD0l8q6eWz'] ['Vicarious' 'TOOL' list(['alternative metal', 'art rock', 'nu metal', 'post-grunge', 'progressive metal', 'progressive rock', 'rock']) 'https://open.spotify.com/track/65ShmiE5aLBdcIGr7tHX35']] Cluster 8 [['Icarus Lives - Instrumental' 'Periphery' list(['djent', 'melodic metalcore', 'progressive metal', 'progressive metalcore']) 'https://open.spotify.com/track/49cn0SCwOBLCM0XaSQtKCl'] ['Lethargica' 'Meshuggah' list(['alternative metal', 'djent', 'groove metal', 'metal', 'nu metal', 'progressive groove metal', 'swedish metal', 'technical groove metal', 'technical thrash']) 'https://open.spotify.com/track/4FvuNTv7dcQtoByEePExgW'] ['Tourniquet' 'TesseracT' list(['djent', 'progressive metal']) 'https://open.spotify.com/track/3wynWk9f1EopMj2Y9P7ZOe'] ['Kascade' 'Animals As Leaders' list(['djent', 'instrumental djent', 'instrumental rock', 'jazz metal', 'progressive jazz fusion', 'progressive metal']) 'https://open.spotify.com/track/7hY2Kc7Hvu0BudOoQwu8Ez'] ['Of Energy - Embers' 'TesseracT' list(['djent', 'progressive metal']) 'https://open.spotify.com/track/3w1eU4ZoEM4uiRJ0d3x2ik'] ['The Grey' 'TesseracT' list(['djent', 'progressive metal']) 'https://open.spotify.com/track/4SclJtfYGNmBAzMky2Sril'] ['Ectogenesis' 'Animals As Leaders' list(['djent', 
'instrumental djent', 'instrumental rock', 'jazz metal', 'progressive jazz fusion', 'progressive metal']) 'https://open.spotify.com/track/2iG3f8JUFgLIVddPlP1zap'] ['An Infinite Regression' 'Animals As Leaders' list(['djent', 'instrumental djent', 'instrumental rock', 'jazz metal', 'progressive jazz fusion', 'progressive metal']) 'https://open.spotify.com/track/6TusaYvpkgyJhJ8tvXo03P'] ['Of Mind - Exile' 'TesseracT' list(['djent', 'progressive metal']) 'https://open.spotify.com/track/23G48RKz9osJ0JINJ9fc8Y'] ['Eden' 'TesseracT' list(['djent', 'progressive metal']) 'https://open.spotify.com/track/2tBd67cpkdg8jpr8bWI8fT']] Cluster 9 [['Darkshines' 'Muse' list(['alternative rock', 'modern rock', 'permanent wave', 'rock']) 'https://open.spotify.com/track/2lS9rFLUICaha7SzIAEZnk'] ['Uprising' 'Muse' list(['alternative rock', 'modern rock', 'permanent wave', 'rock']) 'https://open.spotify.com/track/4VqPOruhp5EdPBeR92t6lQ'] ['No Surprises' 'Radiohead' list(['alternative rock', 'art rock', 'melancholia', 'oxford indie', 'permanent wave', 'rock']) 'https://open.spotify.com/track/10nyNJ6zNy2YVYLrcwLccB'] ['Apocalypse Please' 'Muse' list(['alternative rock', 'modern rock', 'permanent wave', 'rock']) 'https://open.spotify.com/track/6z0QCh7CTU9bE5C7TAHK4R'] ['The Bends' 'Radiohead' list(['alternative rock', 'art rock', 'melancholia', 'oxford indie', 'permanent wave', 'rock']) 'https://open.spotify.com/track/7oDFvnqXkXuiZa1sACXobj'] ['Stockholm Syndrome' 'Muse' list(['alternative rock', 'modern rock', 'permanent wave', 'rock']) 'https://open.spotify.com/track/5VVWgWH8HFLAtM8lbttvn9'] ['Exit Music (For a Film)' 'Radiohead' list(['alternative rock', 'art rock', 'melancholia', 'oxford indie', 'permanent wave', 'rock']) 'https://open.spotify.com/track/4Na0siMtWOW9pJoWJ1Ponv'] ['You' 'Radiohead' list(['alternative rock', 'art rock', 'melancholia', 'oxford indie', 'permanent wave', 'rock']) 'https://open.spotify.com/track/5KZ0qobWEFl892YjIC02SE'] ['Exit Music (For A Film)' 'Radiohead' 
list(['alternative rock', 'art rock', 'melancholia', 'oxford indie', 'permanent wave', 'rock']) 'https://open.spotify.com/track/0z1o5L7HJx562xZSATcIpY'] ['Starlight - Live from Wembley Stadium' 'Muse' list(['alternative rock', 'modern rock', 'permanent wave', 'rock']) 'https://open.spotify.com/track/6wiHSEXWFIspKRS9SoLrUE']]
Cluster 0 seems somewhat strange at first, as most of it seems to be mixed metal, and it could be argued whether that could be categorized as "nu-metal". Some people would definitely say "no", but as the analysis uses genres decided by Spotify, it's ok with respect to the genres provided (that may not be the "real" ones).
Cluster 1 seems ok; most of it comprises songs with a very present rhythmic element, as well as the improvisational aspect.
Cluster 2 also seems ok, with mostly thrash metal, as well as songs with a "thrashy" component, although the whole song or artist may not be "pure" thrash.
Cluster 3 is ok, and being one of the most "big" groups (cluster with several elements in it), it should comprise several styles that could be viewed as "general rock".
Cluster 4 also seems ok, although being one with more "weird" genres, it's also one of the most open to interpretation.
Cluster 5 seems also ok, with songs with a "jazzy" composition.
Cluster 6 has more hip hop related genres, but one element of it (neo-psychedelic) is a little off with respect to the rest. The cluster seems ok, but for some songs it could be debated whether they belong to the genres assigned to them.
Cluster 7 also seems ok, with some songs that definitely are not "indie".
Cluster 8 seems ok also, being a parallel to cluster 3, but with metal instead of rock.
Finally, cluster 9 also seems good, with rhythm and repetition as its main characteristics.
In conclusion, this approach is the simplest, and gives somewhat sensible results. Some tracks may be off in some clusters, and one of the main causes of this problem is the fact that the genre is specified for the whole artist, so if an artist is diverse in genres, it may have songs in some clusters that don't seem ok, even though the artist as a whole could be categorized in that way.
# For every categorical feature, draw one percent-normalized histogram trace
# per NLP cluster so the clusters' distributions can be compared.
for col in categorical_cols:
    fig = go.Figure()
    # Iterate over the same number of clusters used when sampling tracks,
    # rather than a hard-coded 10.
    for i in range(num_clusters):
        cluster_rows = df_exploded.loc[df_exploded["cluster_NLP"] == str(i), :]
        fig.add_trace(
            go.Histogram(x=cluster_rows[col], histnorm="percent", name=f"Cluster {i}")
        )
    fig.update_layout(
        xaxis_title_text=col,
        yaxis_title_text="%",
    )
    fig.update_traces(opacity=0.75)
    fig.show()
# For every numerical feature, overlay one percent-normalized histogram trace
# per NLP cluster so the clusters' distributions can be compared.
for col in numerical_cols:
    # Only plot features available in both the user library and the public data.
    if col not in track_info.columns or col not in public_data.columns:
        continue
    fig = go.Figure()
    # Iterate over the same number of clusters used when sampling tracks,
    # rather than a hard-coded 10.
    for i in range(num_clusters):
        cluster_rows = df_exploded.loc[df_exploded["cluster_NLP"] == str(i), :]
        # Binning is left to Plotly's automatic choice.
        fig.add_trace(
            go.Histogram(x=cluster_rows[col], histnorm="percent", name=f"Cluster {i}")
        )
    fig.update_layout(
        barmode="overlay",
        xaxis_title_text=col,
        yaxis_title_text="%",
    )
    fig.update_traces(opacity=0.75)
    fig.show()
In the next section, the actual features of the tracks are used to try to construct clusters.
First, an EDA is done to see the correlations between numerical features, select those that are not correlated, and construct new features based on combinations of some of them. The objective is to have a final set of uncorrelated features and use them as input to the clustering algorithm.
# Keep, for each track, only the genres present in the `genres` collection.
# The lookup set is built once instead of once per row.
allowed_genres = set(genres)
track_info["genres"] = track_info["genres"].apply(lambda x: list(set(x) & allowed_genres))
# Drop tracks that are left without any genre.
track_info = track_info.loc[track_info["genres"].str.len() != 0]

# Absolute pairwise correlations between the numerical features, as a heatmap.
correlations = track_info[numerical_cols].corr().abs()
fig = px.imshow(correlations, text_auto=True)
fig.update_layout(width=1300, height=1200, autosize=False)
fig.show()
Most of the features seem uncorrelated, but some of them are correlated. Number of sections and loudness are correlated with several other features, so these are dropped completely. The following pairs are correlated with each other, so for each one, a new feature is calculated by multiplying the two:
- Valence and danceability
- Energy and acousticness
- Energy and speechiness
- Key changes and mode changes
# Work on a copy so the raw track table is left untouched.
track_info_new_features = track_info.copy()

# Features dropped outright.
useless_cols = ["num_sections", "loudness"]
numerical_cols_extra = list(set(numerical_cols).difference(useless_cols))
_kept_columns = track_info_new_features.columns.difference(useless_cols)
track_info_new_features = track_info_new_features[_kept_columns]

# Replace each correlated pair by the product of its two members.
_correlated_pairs = (
    ("valence", "danceability"),
    ("energy", "acousticness"),
    ("energy", "speechiness"),
    ("key_changes", "mode_changes"),
)
for left, right in _correlated_pairs:
    product_name = f"{left} - {right}"
    track_info_new_features[product_name] = (
        track_info_new_features[left] * track_info_new_features[right]
    )
    numerical_cols_extra.append(product_name)

# Heatmap of absolute correlations including the new product features.
correlations = track_info_new_features[numerical_cols_extra].corr().abs()
fig = px.imshow(correlations, text_auto=True)
fig.update_layout(width=1300, height=1200, autosize=False)
fig.show()
Mode changes and key changes seem correlated, but when plotting them, there doesn't seem to be a direct and clear relation between them, so for these, instead of constructing a new feature, they are used as they were originally.
(values for these features are normalized with the song duration)
# Scatter plot of mode changes (y) against key changes (x) to inspect their relation visually.
track_info[["mode_changes", "key_changes"]].plot.scatter(x="key_changes", y="mode_changes")
<Axes: xlabel='key_changes', ylabel='mode_changes'>
The final correlation matrix is shown, where it can be seen that almost all of the features are uncorrelated with each other, so they are good candidates for clustering.
# Original features that are either dropped or superseded by a product feature.
useless_cols = ["num_sections", "loudness", "energy", "acousticness", "valence", "danceability", "speechiness"]
_product_features = ["valence - danceability", "energy - acousticness", "energy - speechiness"]

# Remaining raw numerical features followed by the product features.
numerical_cols_extra = list(set(numerical_cols).difference(useless_cols))
numerical_cols_extra = numerical_cols_extra + _product_features

_kept_columns = track_info_new_features.columns.difference(useless_cols)
track_info_new_features = track_info_new_features[_kept_columns]

# Heatmap of the absolute correlations between the retained features.
correlations = track_info_new_features[numerical_cols_extra].corr().abs()
fig = px.imshow(correlations, text_auto=True)
fig.update_layout(width=1300, height=1200, autosize=False)
fig.show()

# Publish the figure to Chart Studio without opening a browser tab.
py.plot(fig, filename="correlations_clean_clustering", auto_open=False)
'https://plotly.com/~jcf94/7/'
These features are scaled, and the categorical features are also added and encoded using one-hot encoding.
# Rebuild the feature table from the track data and assemble the matrix used
# for clustering: retained numerical features, product features and one-hot
# encoded categorical features, followed by standardization.
track_info_new_features = track_info.copy()
useless_cols = ["num_sections", "loudness", "energy", "acousticness", "valence", "danceability", "speechiness"]
combinations_cols = [("valence", "danceability"), ("energy", "acousticness"), ("energy", "speechiness")]
numerical_cols_extra = list(set(numerical_cols).difference(useless_cols))
for first, second in combinations_cols:
    product_name = f"{first} - {second}"
    track_info_new_features[product_name] = (
        track_info_new_features[first] * track_info_new_features[second]
    )
    numerical_cols_extra.append(product_name)
# The product columns were already appended in the loop; adding them a second
# time would duplicate them in X and double their weight in the clustering.
numerical_cols_extra = numerical_cols_extra + categorical_cols
X = track_info_new_features[numerical_cols_extra]
X = pd.get_dummies(X, columns=categorical_cols)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
KMeans is applied to the features to try to cluster them. 10 groups are used.
# Cluster the tracks with KMeans on the standardized feature matrix, so that
# features with large numeric ranges do not dominate the distance computation.
kmeans = KMeans(n_clusters=10, random_state=random_state)
kmeans.fit(X_scaled)
labels = kmeans.labels_
track_info_labeled = track_info_new_features.copy()
track_info_labeled["cluster"] = labels
genres = track_info_new_features["genres"]
# Print the genres grouped by cluster
# Count, per cluster label, how often each genre occurs among its tracks.
# The loop variable is deliberately not named `genres`, so the Series above
# is not overwritten by the last track's genre list.
clusters = {}
for track_genres, label in zip(genres, labels):
    clusters.setdefault(label, Counter()).update(track_genres)
clusters = dict(sorted(clusters.items()))
for cluster_id, genre_list in clusters.items():
    # `length` is the number of distinct genres found in the cluster.
    print(f"Cluster {cluster_id} (length: {len(genre_list)}): {genre_list}\n")
Cluster 0 (length: 79): Counter({'rock': 130, 'alternative rock': 99, 'permanent wave': 90, 'funk metal': 47, 'alternative metal': 47, 'funk rock': 44, 'modern rock': 34, 'nu metal': 33, 'art rock': 30, 'melancholia': 22, 'metal': 21, 'oxford indie': 20, 'hard rock': 20, 'instrumental rock': 18, 'progressive metal': 18, 'post-grunge': 15, 'progressive rock': 15, 'classic rock': 15, 'grunge': 14, 'album rock': 13, 'groove metal': 13, 'progressive groove metal': 12, 'french metal': 12, 'rap metal': 11, 'french death metal': 11, 'el paso indie': 10, 'dance pop': 9, 'symphonic rock': 9, 'djent': 8, 'double drumming': 7, 'australian psych': 7, 'microtonal': 7, 'neo-psychedelic': 7, 'indie rock': 6, 'progressive jazz fusion': 6, 'old school thrash': 6, 'garage rock': 6, 'psychedelic rock': 6, 'thrash metal': 5, 'instrumental funk': 4, 'melodic thrash': 4, 'speed metal': 4, 'swedish metal': 4, 'jazz metal': 3, 'instrumental djent': 3, 'swedish progressive metal': 3, 'soft rock': 3, 'mellow gold': 3, 'supergroup': 3, 'instrumental math rock': 3, 'jam band': 3, 'glam rock': 3, 'canadian metal': 2, 'classic canadian rock': 2, 'stoner rock': 2, 'stoner metal': 2, 'palm desert scene': 2, 'political hip hop': 2, 'conscious hip hop': 2, 'rap rock': 2, 'uk post-punk': 2, 'new wave': 2, 'jazz funk': 2, 'jazz fusion': 2, 'jazz': 2, 'electric bass': 2, 'sacramento indie': 2, 'math rock': 2, 'funk': 1, 'p funk': 1, 'technical thrash': 1, 'technical groove metal': 1, 'zolo': 1, 'jazz rock': 1, 'trip hop': 1, 'blues rock': 1, 'contemporary jazz': 1, 'post-rock': 1, 'noise pop': 1})
Cluster 1 (length: 44): Counter({'progressive rock': 22, 'rock': 17, 'art rock': 16, 'progressive metal': 16, 'classic rock': 14, 'album rock': 13, 'symphonic rock': 12, 'hard rock': 9, 'alternative metal': 9, 'soft rock': 6, 'metal': 6, 'psychedelic rock': 6, 'nu metal': 5, 'funk rock': 4, 'post-grunge': 4, 'zolo': 3, 'jazz rock': 3, 'instrumental rock': 3, 'swedish metal': 3, 'progressive death metal': 3, 'swedish progressive metal': 3, 'canadian metal': 3, 'classic canadian rock': 3, 'mellow gold': 3, 'djent': 2, 'contemporary jazz': 2, 'jazz': 2, 'progressive jazz fusion': 2, 'double drumming': 2, 'australian psych': 2, 'microtonal': 2, 'neo-psychedelic': 2, 'garage rock': 2, 'el paso indie': 2, 'funk metal': 1, 'alternative rock': 1, 'grunge': 1, 'stoner rock': 1, 'progressive groove metal': 1, 'stoner metal': 1, 'blues rock': 1, 'glam rock': 1, 'funk': 1, 'p funk': 1})
Cluster 2 (length: 69): Counter({'rock': 48, 'alternative metal': 35, 'alternative rock': 26, 'nu metal': 23, 'metal': 22, 'hard rock': 20, 'progressive metal': 19, 'permanent wave': 17, 'funk rock': 13, 'grunge': 12, 'progressive rock': 11, 'art rock': 10, 'modern rock': 10, 'old school thrash': 10, 'djent': 10, 'groove metal': 10, 'thrash metal': 9, 'classic rock': 9, 'progressive groove metal': 9, 'funk metal': 8, 'instrumental rock': 8, 'el paso indie': 7, 'album rock': 7, 'french metal': 7, 'french death metal': 7, 'jazz': 6, 'progressive jazz fusion': 6, 'symphonic rock': 6, 'garage rock': 6, 'melodic thrash': 6, 'speed metal': 6, 'math rock': 5, 'contemporary jazz': 5, 'psychedelic rock': 5, 'stoner rock': 4, 'palm desert scene': 4, 'instrumental math rock': 4, 'post-grunge': 4, 'oxford indie': 3, 'melancholia': 3, 'stoner metal': 3, 'rap metal': 3, 'jazz funk': 3, 'jazz fusion': 3, 'electric bass': 3, 'instrumental funk': 3, 'sacramento indie': 2, 'microtonal': 2, 'dance pop': 2, 'blues rock': 2, 'zolo': 2, 'jazz rock': 2, 'technical thrash': 2, 'swedish metal': 2, 'technical groove metal': 2, 'uk post-punk': 2, 'new wave': 2, 'supergroup': 1, 'jam band': 1, 'mellow gold': 1, 'singer-songwriter': 1, 'funk': 1, 'p funk': 1, 'glam rock': 1, 'political hip hop': 1, 'conscious hip hop': 1, 'rap rock': 1, 'australian psych': 1, 'post-rock': 1})
Cluster 3 (length: 28): Counter({'progressive metal': 5, 'progressive rock': 4, 'album rock': 2, 'symphonic rock': 2, 'art rock': 2, 'classic rock': 2, 'metal': 2, 'double drumming': 1, 'australian psych': 1, 'microtonal': 1, 'neo-psychedelic': 1, 'garage rock': 1, 'el paso indie': 1, 'soft rock': 1, 'mellow gold': 1, 'hard rock': 1, 'blues rock': 1, 'singer-songwriter': 1, 'technical thrash': 1, 'progressive groove metal': 1, 'nu metal': 1, 'swedish metal': 1, 'groove metal': 1, 'djent': 1, 'alternative metal': 1, 'technical groove metal': 1, 'rock': 1, 'psychedelic rock': 1})
Cluster 4 (length: 61): Counter({'rock': 23, 'progressive metal': 21, 'progressive rock': 18, 'alternative metal': 16, 'nu metal': 14, 'hard rock': 14, 'classic rock': 14, 'album rock': 12, 'metal': 12, 'art rock': 11, 'alternative rock': 10, 'symphonic rock': 9, 'funk rock': 7, 'soft rock': 7, 'djent': 7, 'mellow gold': 6, 'grunge': 5, 'jazz': 5, 'progressive groove metal': 5, 'groove metal': 5, 'funk metal': 4, 'post-rock': 3, 'instrumental rock': 3, 'contemporary jazz': 3, 'progressive jazz fusion': 3, 'permanent wave': 3, 'french metal': 3, 'french death metal': 3, 'double drumming': 3, 'australian psych': 3, 'microtonal': 3, 'neo-psychedelic': 3, 'jam band': 3, 'indie rock': 3, 'melancholia': 3, 'garage rock': 3, 'el paso indie': 3, 'modern rock': 3, 'post-grunge': 2, 'technical thrash': 2, 'swedish metal': 2, 'technical groove metal': 2, 'supergroup': 2, 'thrash metal': 2, 'old school thrash': 2, 'glam rock': 2, 'jazz funk': 2, 'jazz fusion': 2, 'zolo': 2, 'psychedelic rock': 2, 'canadian metal': 2, 'classic canadian rock': 2, 'stoner rock': 1, 'sacramento indie': 1, 'rap metal': 1, 'instrumental math rock': 1, 'math rock': 1, 'noise pop': 1, 'instrumental funk': 1, 'jazz rock': 1, 'electric bass': 1})
Cluster 5 (length: 82): Counter({'rock': 75, 'alternative rock': 54, 'alternative metal': 47, 'permanent wave': 38, 'nu metal': 32, 'progressive metal': 31, 'progressive rock': 25, 'hard rock': 24, 'art rock': 23, 'metal': 21, 'classic rock': 18, 'instrumental rock': 17, 'funk rock': 17, 'grunge': 16, 'funk metal': 16, 'djent': 15, 'modern rock': 14, 'album rock': 14, 'melancholia': 12, 'symphonic rock': 11, 'groove metal': 11, 'psychedelic rock': 10, 'progressive groove metal': 10, 'post-grunge': 10, 'el paso indie': 10, 'progressive jazz fusion': 10, 'oxford indie': 9, 'dance pop': 9, 'rap metal': 7, 'microtonal': 7, 'jazz': 7, 'french metal': 7, 'french death metal': 7, 'garage rock': 7, 'swedish metal': 7, 'stoner rock': 6, 'canadian metal': 6, 'classic canadian rock': 6, 'australian psych': 6, 'zolo': 6, 'jazz rock': 6, 'jazz fusion': 6, 'electric bass': 6, 'old school thrash': 6, 'trip hop': 5, 'double drumming': 5, 'neo-psychedelic': 5, 'jazz funk': 5, 'swedish progressive metal': 5, 'contemporary jazz': 5, 'melodic thrash': 5, 'thrash metal': 5, 'speed metal': 5, 'jazz metal': 5, 'instrumental djent': 5, 'stoner metal': 4, 'palm desert scene': 4, 'political hip hop': 4, 'conscious hip hop': 4, 'rap rock': 4, 'blues rock': 4, 'singer-songwriter': 4, 'uk post-punk': 4, 'new wave': 4, 'math rock': 4, 'sacramento indie': 3, 'funk': 3, 'p funk': 3, 'technical thrash': 3, 'technical groove metal': 3, 'supergroup': 3, 'soft rock': 3, 'mellow gold': 3, 'jam band': 3, 'instrumental funk': 3, 'instrumental math rock': 3, 'glam rock': 2, 'progressive death metal': 2, 'indie rock': 2, 'shoegaze': 2, 'acid rock': 1, 'noise pop': 1})
Cluster 6 (length: 33): Counter({'progressive metal': 14, 'progressive rock': 10, 'art rock': 9, 'alternative metal': 9, 'metal': 8, 'rock': 6, 'symphonic rock': 5, 'psychedelic rock': 5, 'nu metal': 5, 'zolo': 4, 'jazz rock': 4, 'instrumental rock': 4, 'post-grunge': 4, 'swedish metal': 4, 'garage rock': 4, 'el paso indie': 4, 'progressive death metal': 3, 'swedish progressive metal': 3, 'jazz': 2, 'djent': 2, 'funk rock': 1, 'contemporary jazz': 1, 'progressive jazz fusion': 1, 'hard rock': 1, 'jazz funk': 1, 'jazz fusion': 1, 'electric bass': 1, 'technical thrash': 1, 'progressive groove metal': 1, 'groove metal': 1, 'technical groove metal': 1, 'album rock': 1, 'classic rock': 1})
Cluster 7 (length: 11): Counter({'progressive metal': 3, 'progressive rock': 2, 'symphonic rock': 2, 'metal': 1, 'soft rock': 1, 'mellow gold': 1, 'album rock': 1, 'rock': 1, 'hard rock': 1, 'classic rock': 1, 'art rock': 1})
Cluster 8 (length: 74): Counter({'rock': 55, 'alternative rock': 39, 'alternative metal': 27, 'permanent wave': 23, 'nu metal': 20, 'hard rock': 18, 'funk rock': 17, 'classic rock': 16, 'album rock': 15, 'funk metal': 15, 'grunge': 13, 'psychedelic rock': 11, 'art rock': 10, 'progressive rock': 9, 'instrumental rock': 9, 'neo-psychedelic': 9, 'microtonal': 9, 'metal': 8, 'el paso indie': 8, 'double drumming': 8, 'australian psych': 8, 'modern rock': 8, 'symphonic rock': 7, 'progressive metal': 7, 'garage rock': 7, 'acid rock': 6, 'djent': 6, 'french metal': 6, 'rap metal': 6, 'noise pop': 5, 'shoegaze': 5, 'progressive groove metal': 4, 'french death metal': 4, 'groove metal': 4, 'instrumental funk': 4, 'oxford indie': 4, 'melancholia': 4, 'melodic thrash': 4, 'thrash metal': 4, 'old school thrash': 4, 'speed metal': 4, 'post-grunge': 4, 'soft rock': 3, 'mellow gold': 3, 'jazz fusion': 3, 'jazz': 3, 'progressive jazz fusion': 3, 'jazz funk': 2, 'electric bass': 2, 'jam band': 2, 'sacramento indie': 2, 'indie rock': 2, 'instrumental math rock': 2, 'singer-songwriter': 2, 'blues rock': 2, 'supergroup': 2, 'jazz metal': 2, 'instrumental djent': 2, 'trip hop': 1, 'canadian metal': 1, 'classic canadian rock': 1, 'funk': 1, 'p funk': 1, 'political hip hop': 1, 'conscious hip hop': 1, 'rap rock': 1, 'dance pop': 1, 'uk post-punk': 1, 'new wave': 1, 'post-rock': 1, 'math rock': 1, 'stoner rock': 1, 'stoner metal': 1, 'palm desert scene': 1})
Cluster 9 (length: 46): Counter({'progressive rock': 16, 'rock': 15, 'progressive metal': 13, 'art rock': 12, 'album rock': 10, 'hard rock': 10, 'classic rock': 10, 'symphonic rock': 9, 'metal': 8, 'alternative metal': 6, 'soft rock': 6, 'funk rock': 5, 'nu metal': 5, 'contemporary jazz': 4, 'jazz': 4, 'progressive jazz fusion': 4, 'post-grunge': 3, 'double drumming': 3, 'australian psych': 3, 'microtonal': 3, 'neo-psychedelic': 3, 'djent': 3, 'psychedelic rock': 3, 'french metal': 3, 'mellow gold': 2, 'instrumental rock': 2, 'canadian metal': 2, 'classic canadian rock': 2, 'thrash metal': 2, 'old school thrash': 2, 'dance pop': 2, 'progressive groove metal': 2, 'french death metal': 2, 'groove metal': 2, 'garage rock': 2, 'el paso indie': 2, 'funk': 1, 'p funk': 1, 'jam band': 1, 'zolo': 1, 'jazz rock': 1, 'post-rock': 1, 'shoegaze': 1, 'swedish metal': 1, 'progressive death metal': 1, 'swedish progressive metal': 1})
# For every cluster, keep only the genres whose count is at least 5 and print
# the reduced mapping, so that rarely occurring genres do not clutter the view.
for cluster_id, genre_list in clusters.items():
    genre_list_filtered = {}
    for genre, count in genre_list.items():
        if count >= 5:
            genre_list_filtered[genre] = count
    print(f"Cluster {cluster_id} (length: {len(genre_list_filtered)}): {genre_list_filtered}\n")
Cluster 0 (length: 39): {'rock': 130, 'modern rock': 34, 'indie rock': 6, 'alternative rock': 99, 'progressive jazz fusion': 6, 'instrumental rock': 18, 'dance pop': 9, 'permanent wave': 90, 'grunge': 14, 'oxford indie': 20, 'melancholia': 22, 'art rock': 30, 'funk metal': 47, 'funk rock': 44, 'post-grunge': 15, 'alternative metal': 47, 'metal': 21, 'progressive metal': 18, 'progressive rock': 15, 'album rock': 13, 'hard rock': 20, 'classic rock': 15, 'thrash metal': 5, 'old school thrash': 6, 'garage rock': 6, 'el paso indie': 10, 'progressive groove metal': 12, 'nu metal': 33, 'groove metal': 13, 'djent': 8, 'symphonic rock': 9, 'psychedelic rock': 6, 'rap metal': 11, 'double drumming': 7, 'australian psych': 7, 'microtonal': 7, 'neo-psychedelic': 7, 'french metal': 12, 'french death metal': 11}
Cluster 1 (length: 13): {'soft rock': 6, 'progressive rock': 22, 'album rock': 13, 'rock': 17, 'hard rock': 9, 'symphonic rock': 12, 'classic rock': 14, 'art rock': 16, 'nu metal': 5, 'alternative metal': 9, 'metal': 6, 'progressive metal': 16, 'psychedelic rock': 6}
Cluster 2 (length: 34): {'rock': 48, 'alternative rock': 26, 'permanent wave': 17, 'art rock': 10, 'grunge': 12, 'alternative metal': 35, 'modern rock': 10, 'math rock': 5, 'nu metal': 23, 'hard rock': 20, 'metal': 22, 'thrash metal': 9, 'old school thrash': 10, 'funk rock': 13, 'contemporary jazz': 5, 'jazz': 6, 'progressive jazz fusion': 6, 'funk metal': 8, 'progressive metal': 19, 'progressive rock': 11, 'symphonic rock': 6, 'djent': 10, 'garage rock': 6, 'el paso indie': 7, 'melodic thrash': 6, 'speed metal': 6, 'album rock': 7, 'psychedelic rock': 5, 'classic rock': 9, 'progressive groove metal': 9, 'french metal': 7, 'french death metal': 7, 'groove metal': 10, 'instrumental rock': 8}
Cluster 3 (length: 1): {'progressive metal': 5}
Cluster 4 (length: 20): {'progressive rock': 18, 'nu metal': 14, 'rock': 23, 'progressive metal': 21, 'alternative metal': 16, 'art rock': 11, 'alternative rock': 10, 'grunge': 5, 'funk rock': 7, 'jazz': 5, 'soft rock': 7, 'mellow gold': 6, 'album rock': 12, 'hard rock': 14, 'symphonic rock': 9, 'classic rock': 14, 'metal': 12, 'progressive groove metal': 5, 'groove metal': 5, 'djent': 7}
Cluster 5 (length: 55): {'stoner rock': 6, 'rock': 75, 'alternative rock': 54, 'grunge': 16, 'alternative metal': 47, 'modern rock': 14, 'progressive rock': 25, 'album rock': 14, 'hard rock': 24, 'canadian metal': 6, 'classic rock': 18, 'classic canadian rock': 6, 'rap metal': 7, 'nu metal': 32, 'instrumental rock': 17, 'symphonic rock': 11, 'psychedelic rock': 10, 'art rock': 23, 'funk metal': 16, 'funk rock': 17, 'trip hop': 5, 'double drumming': 5, 'australian psych': 6, 'microtonal': 7, 'neo-psychedelic': 5, 'zolo': 6, 'jazz rock': 6, 'metal': 21, 'progressive metal': 31, 'djent': 15, 'jazz funk': 5, 'jazz fusion': 6, 'jazz': 7, 'electric bass': 6, 'oxford indie': 9, 'melancholia': 12, 'permanent wave': 38, 'progressive groove metal': 10, 'french metal': 7, 'french death metal': 7, 'groove metal': 11, 'post-grunge': 10, 'garage rock': 7, 'el paso indie': 10, 'dance pop': 9, 'swedish metal': 7, 'swedish progressive metal': 5, 'contemporary jazz': 5, 'progressive jazz fusion': 10, 'melodic thrash': 5, 'thrash metal': 5, 'old school thrash': 6, 'speed metal': 5, 'jazz metal': 5, 'instrumental djent': 5}
Cluster 6 (length: 9): {'metal': 8, 'progressive metal': 14, 'progressive rock': 10, 'symphonic rock': 5, 'psychedelic rock': 5, 'art rock': 9, 'nu metal': 5, 'rock': 6, 'alternative metal': 9}
Cluster 7 (length: 0): {}
Cluster 8 (length: 31): {'progressive rock': 9, 'album rock': 15, 'symphonic rock': 7, 'classic rock': 16, 'art rock': 10, 'alternative rock': 39, 'rock': 55, 'hard rock': 18, 'acid rock': 6, 'psychedelic rock': 11, 'funk rock': 17, 'funk metal': 15, 'permanent wave': 23, 'djent': 6, 'progressive metal': 7, 'french metal': 6, 'nu metal': 20, 'metal': 8, 'alternative metal': 27, 'instrumental rock': 9, 'garage rock': 7, 'el paso indie': 8, 'grunge': 13, 'noise pop': 5, 'neo-psychedelic': 9, 'rap metal': 6, 'microtonal': 9, 'double drumming': 8, 'australian psych': 8, 'modern rock': 8, 'shoegaze': 5}
Cluster 9 (length: 13): {'funk rock': 5, 'progressive rock': 16, 'nu metal': 5, 'rock': 15, 'progressive metal': 13, 'alternative metal': 6, 'art rock': 12, 'soft rock': 6, 'album rock': 10, 'hard rock': 10, 'symphonic rock': 9, 'classic rock': 10, 'metal': 8}
Most tracks have several genres, which makes the grouping harder: when training the model, there are several rows with the same input features but a different "target" (as this is an unsupervised ML model there is no real target, but the genres each track belongs to are known, and we use this information to assess the clustering), so the same genre can end up in several clusters. Although this could help the model generalize, with a dataset of this size (and with an unequal distribution of genres) it is more of a problem.
The clusters are not explained in detail, but it can be seen that some of them are very large (cluster 0 contains around 75% of the genres in the dataset), and they are all very similar in general (they all contain mixes of rock and metal). It does not seem to be a good clustering, as there is no clear distinction between genres. A filtered list is also made, keeping only the genres that appear at least 5 times in each cluster, to exclude those with low representation. This filtering can help, but the results show that the clusters are still very mixed, without a clear separation.
# Print up to five randomly sampled tracks from every cluster, as a quick
# qualitative check of what each cluster contains.
# The cluster ids are read from the data instead of a hard-coded range(0, 15),
# which ran past the labels that actually exist and printed empty clusters.
for i in sorted(track_info_labeled["cluster"].dropna().unique()):
    _df = track_info_labeled.loc[track_info_labeled["cluster"] == i, ["track_name", "artist", "track_url"]]
    # sample() cannot draw more rows than the frame holds, so cap at the cluster size.
    elements = min(5, _df.shape[0])
    print(f"Cluster {i}")
    print(_df.sample(elements).values)
Cluster 0 [['Naked in the Rain' 'Red Hot Chili Peppers' 'https://open.spotify.com/track/6xid2NOIS1aqviPPe67OqF'] ['Bliss' 'Muse' 'https://open.spotify.com/track/0j3obufLXq5toSs592dX9U'] ["Tonight's Music" 'Katatonia' 'https://open.spotify.com/track/0yQQYRWf3EoW4kNQK2MAHD'] ['Bloodbath (feat. Chino Moreno)' 'Polyphia' 'https://open.spotify.com/track/2IMHE3XJcsqTIbSGOIY6Jy'] ['Obstacle 1' 'Interpol' 'https://open.spotify.com/track/0vrlOGhCwK3xqy1xNUQJHj']] Cluster 1 [['Crumbling Castle' 'King Gizzard & The Lizard Wizard' 'https://open.spotify.com/track/0NDidThJQ6nbftCXAbHjp5'] ['Soul Sacrifice - Live at The Woodstock Music & Art Fair, August 16, 1969' 'Santana' 'https://open.spotify.com/track/7zAoVpCLJFsyRfCbGUIAFf'] ['Radioactive Toy' 'Porcupine Tree' 'https://open.spotify.com/track/4UPSHx4o2ZoWA3CsXlteon'] ['Reflection' 'TOOL' 'https://open.spotify.com/track/0R7HFX1LW3E0ZR5BnAJLHz'] ['The Czar: Usurper / Escape / Martyr / Spiral' 'Mastodon' 'https://open.spotify.com/track/2LMjQnKH7sQzOD0l8q6eWz']] Cluster 2 [['High Times - Remastered' 'Jamiroquai' 'https://open.spotify.com/track/4LGZzACsdcdtx9FxIukGsS'] ['The National Anthem' 'Radiohead' 'https://open.spotify.com/track/4Wgj6jzoI2gYlumXdYAB8U'] ['The Shooting Star' 'Gojira' 'https://open.spotify.com/track/6HQfFAupOMsmfWV4CbG1Kj'] ['The Lesson' 'Victor Wooten' 'https://open.spotify.com/track/7K2kv5hSuJPi61vYJeGKEO'] ['Wolf down the earth' 'Gojira' 'https://open.spotify.com/track/368sgQMnEzwo5mpShvPFJm']] Cluster 3 [['The Dripping Tap' 'King Gizzard & The Lizard Wizard' 'https://open.spotify.com/track/0o6rOggbaLEvtwUHNztuD2'] ['Tarkus - i. Eruption / ii. Stones of Years / iii. Iconoclast / iv. Mass / v. Manticore / vi. Battlefield / vii. 
Aquatarkus; 2012 Remaster' 'Emerson, Lake & Palmer' 'https://open.spotify.com/track/53XV0umjeHpsobMTPzcwc3'] ['In the Presence of Enemies - Part II' 'Dream Theater' 'https://open.spotify.com/track/1owCsgj8PwFdkf5xeG50Ig'] ['Anesthetize - Live' 'Porcupine Tree' 'https://open.spotify.com/track/5buRmG2pBjM8HlZAWIud5d'] ['Escalator Shrine - Live in Tilburg' 'Riverside' 'https://open.spotify.com/track/4O6N871ISfiwmzk5qxzkhI']] Cluster 4 [['21st Century Schizoid Man' 'King Crimson' 'https://open.spotify.com/track/5yClziwiwTdqRmdPQl3NDz'] ['Ticks & Leeches' 'TOOL' 'https://open.spotify.com/track/0Cnx6PGogxIE2RnDcnoeK8'] ['Harper Lewis' 'Russian Circles' 'https://open.spotify.com/track/2iG7MCJdoM4NfvsK2AVgqY'] ['Open Water' 'King Gizzard & The Lizard Wizard' 'https://open.spotify.com/track/0odQCRSjtsGaRCkAgeeN1D'] ['Pink Maggit' 'Deftones' 'https://open.spotify.com/track/0FrT8a39eo6siL1yIxVGTP']] Cluster 5 [['A Letter To Elise' 'The Cure' 'https://open.spotify.com/track/7mEGddVRDdESAibWOnbXoA'] ['Pitchless Tone' 'The Mercury Tree' 'https://open.spotify.com/track/4PzkOfsxQZNFVvIMBPwo5W'] ['Halo' 'Porcupine Tree' 'https://open.spotify.com/track/7caQujgL286vqNHnCdaclg'] ["Cat's Eye/Yellow Fever (Running)" 'Van Der Graaf Generator' 'https://open.spotify.com/track/126UEF2A18PA8xNnYUkt42'] ['Showbiz' 'Muse' 'https://open.spotify.com/track/2sCFFlnYg6Lk75GCcfSXEz']] Cluster 6 [['Improv: A Voyage To The Centre Of The Cosmos [Bonus Track]' 'King Crimson' 'https://open.spotify.com/track/6ZtLxpmKe20h0NJJQLMRbh'] ['Starless' 'King Crimson' 'https://open.spotify.com/track/1Kt1j54YhvP39PnSQjU8H3'] ['Cicatriz Esp' 'The Mars Volta' 'https://open.spotify.com/track/3I7gJOfTamHWpDfbwQUO6T'] ['The Curtain - Live From Dordrecht, Het Energiehuis / 2014' 'Snarky Puppy' 'https://open.spotify.com/track/29ls98FgNdZHbmqdQeF7E6'] ['Advent - Live' 'Opeth' 'https://open.spotify.com/track/0pgGKQ6pUXEX9FkvWksiuq']] Cluster 7 [['Venus & Mars' 'Jack The Joker' 
'https://open.spotify.com/track/3VnrO2i3uFCkXE6n5ROvkR'] ['Octavarium' 'Dream Theater' 'https://open.spotify.com/track/4TZo49HN2MkbWmHMTf4NcH'] ['Tubular Bells - Pt. I' 'Mike Oldfield' 'https://open.spotify.com/track/7ERSQrRptZVM7q3VOdM7OL'] ["Supper's Ready - Digital Remastered 2008" 'Genesis' 'https://open.spotify.com/track/5LsZlfvxBN6zs4ZjlW0aFC'] ['Visions - remastered 2017' 'Haken' 'https://open.spotify.com/track/4oBKsjV5vzQ4pIlkp6Jfk4']] Cluster 8 [['Nervous Waltz' 'Igorrr' 'https://open.spotify.com/track/0GOytj5zpFRpjGN04FLmHT'] ['Master Exploder' 'Tenacious D' 'https://open.spotify.com/track/1EXfR9HR6cM1y6qagNQW4g'] ['Peter Gunn (Live 1977)' 'Emerson, Lake & Palmer' 'https://open.spotify.com/track/1XvnKIRg3wxDkN6MaW8KZC'] ['Eclipse - 2011 Remastered Version' 'Pink Floyd' 'https://open.spotify.com/track/3Z2RsIdWm4BNbT0LsFBuoN'] ['What Did He Say?' 'Victor Wooten' 'https://open.spotify.com/track/5VJBQgRaE0M8PwZJQVaQzY']] Cluster 9 [['Interstellar Overdrive - 2011 Remaster' 'Pink Floyd' 'https://open.spotify.com/track/63KHfOzsogBZO5LPmxCyVF'] ['Upside Down' 'Gazpacho' 'https://open.spotify.com/track/69xUkf647IyVn8cJtQ4zRk'] ['Quarter Master' 'Snarky Puppy' 'https://open.spotify.com/track/2zAGtTlxNoCKdhvdYfgCrU'] ['The art of dying' 'Gojira' 'https://open.spotify.com/track/7iMQChXFK33TS49QWhE4tt'] ['Iron Lung' 'King Gizzard & The Lizard Wizard' 'https://open.spotify.com/track/7G69PANdawtb2rsG2ZGEQa']] Cluster 10 [] Cluster 11 [] Cluster 12 [] Cluster 13 [] Cluster 14 []
Although some clusters seem OK (their songs seem to share some traits), most clusters are very mixed, with a little bit of everything in each one. It is not a good grouping: assigning a random cluster to a track would have a fair chance of matching the ML clustering.
Finally, a GMM model is fitted to the data to see if its results are better than KMeans.
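The GMM model gives the probability that a data point belongs to each cluster. We save the cluster with the highest probability, as well as every cluster whose probability is higher than a certain threshold (in those cases, the data point can be part of more than one cluster), and compare the results. Note that the probabilities of a data point sum to 1, so with the threshold of 0.5 used below at most one cluster can exceed it; a lower threshold is needed for a track to actually fall into several clusters.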
# Fit a Gaussian Mixture Model on the scaled features and label every track twice:
#   - "cluster_single": the component returned by predict()
#   - "clusters": every component whose probability is above `lower_limit`
lower_limit = 0.5
gmm_components = 10

model = GaussianMixture(
    n_components=gmm_components,
    covariance_type="tied",
    random_state=random_state,
)
model.fit(X_scaled)
results = model.predict(X_scaled)
probabilities = model.predict_proba(X_scaled)

track_clustered = track_info.copy()
track_clustered["cluster_single"] = results
# NOTE(review): each row of predict_proba sums to 1, so with lower_limit = 0.5
# and a strict ">" at most one component can pass, and a track whose best
# probability is <= 0.5 gets an empty list. Confirm this threshold is intended.
track_clustered["clusters"] = [
    [component for component, prob in enumerate(row) if prob > lower_limit]
    for row in probabilities.tolist()
]
track_clustered.head(5)
| num_sections | danceability | track_name | sections_avg_duration | instrumentalness | liveness | track_url | loudness | duration | speechiness | ... | artist_id | key_changes | key | tempo | genres | explicit | case | cluster_NLP | cluster_single | clusters | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7.0 | 0.677 | No You Girls | 31.970 | 0.000077 | 0.0967 | https://open.spotify.com/track/4VP8QiCeaZq8BeT... | -4.102 | 223.79 | 0.0296 | ... | 0XNa1vTidXlvJ2gHSsRi4A | 0.000000 | D | 104.780 | [rock, modern rock, indie rock, alternative rock] | False | user | [3, 7] | 2 | [2] |
| 1 | 14.0 | 0.449 | Cliffs Of Dover - Instrumental | 17.841 | 0.149000 | 0.2480 | https://open.spotify.com/track/5qm0KiVKMXW1kq6... | -12.029 | 249.77 | 0.0405 | ... | 4CxobvwTpmfpIEbkYh4pAb | 0.008007 | G | 94.907 | [progressive jazz fusion, instrumental rock] | False | user | [3, 5] | 1 | [1] |
| 2 | 9.0 | 0.526 | Genesis Ch.1. V.32 | 23.241 | 0.984000 | 0.1100 | https://open.spotify.com/track/2Pmkm67wkf5ucIO... | -9.747 | 209.17 | 0.0372 | ... | 2m62cc253Xvd9qYQ8d2X3d | 0.000000 | D | 126.610 | [soft rock, progressive rock, mellow gold, alb... | False | user | [3, 4, 7] | 2 | [2] |
| 3 | 13.0 | 0.745 | Stop Don't Panic | 20.936 | 0.588000 | 0.1880 | https://open.spotify.com/track/38rSSEjYzSngmL0... | -5.549 | 272.17 | 0.0502 | ... | 6J7biCazzYhU3gM9j1wfid | 0.000000 | C# | 112.450 | [dance pop] | False | user | [1] | 8 | [8] |
| 4 | 27.0 | 0.270 | America - 2003 Remaster | 23.392 | 0.283000 | 0.3010 | https://open.spotify.com/track/5jleoXgOf41HdLI... | -8.755 | 631.57 | 0.0630 | ... | 7AC976RDJzL2asmZuz7qil | 0.004750 | D | 176.030 | [soft rock, progressive rock, album rock, rock... | False | user | [3, 7] | 2 | [2] |
5 rows × 33 columns
# List the distinct genres found in each GMM cluster, using the single cluster
# stored in "cluster_single".
# explode() yields one row per (track, genre) pair; missing genres are
# labelled "No Genre" so that they still show up in the summary.
genre_info_clustered = track_clustered.explode("genres")
genre_info_clustered["genres"] = genre_info_clustered["genres"].fillna("No Genre")
cluster_items = genre_info_clustered[["cluster_single", "genres"]].groupby("cluster_single").agg(list).reset_index()
# sorted(set(...)) rather than list(set(...)): the iteration order of a set of
# strings changes between interpreter runs (hash randomisation), which made the
# printed lists non-reproducible and hard to compare across clusters.
cluster_items["genres"] = cluster_items["genres"].apply(lambda x: sorted(set(x)))
for _, row in cluster_items.iterrows():
    print(f"Cluster {row['cluster_single']} (length: {len(row['genres'])}): {', '.join(row['genres'])}\n")
Cluster 0 (length: 70): jam band, funk metal, stoner rock, singer-songwriter, french metal, garage rock, glam rock, album rock, mellow gold, microtonal, french death metal, groove metal, jazz funk, melodic thrash, classic rock, uk post-punk, old school thrash, new wave, modern rock, instrumental rock, swedish progressive metal, contemporary jazz, stoner metal, technical thrash, indie rock, rap metal, rock, hard rock, jazz, acid rock, djent, instrumental funk, el paso indie, australian psych, math rock, supergroup, soft rock, oxford indie, progressive rock, sacramento indie, nu metal, symphonic rock, swedish metal, shoegaze, psychedelic rock, funk rock, permanent wave, speed metal, progressive jazz fusion, classic canadian rock, neo-psychedelic, double drumming, progressive groove metal, post-grunge, melancholia, progressive metal, alternative rock, grunge, palm desert scene, metal, canadian metal, thrash metal, alternative metal, blues rock, electric bass, progressive death metal, jazz fusion, technical groove metal, dance pop, art rock Cluster 1 (length: 69): jam band, funk metal, stoner rock, conscious hip hop, french metal, garage rock, zolo, album rock, microtonal, french death metal, groove metal, p funk, melodic thrash, classic rock, old school thrash, jazz funk, modern rock, instrumental rock, swedish progressive metal, contemporary jazz, stoner metal, indie rock, rap metal, rock, hard rock, jazz, djent, instrumental funk, el paso indie, funk, jazz rock, australian psych, supergroup, soft rock, oxford indie, progressive rock, sacramento indie, nu metal, symphonic rock, swedish metal, rap rock, psychedelic rock, funk rock, permanent wave, speed metal, progressive jazz fusion, neo-psychedelic, political hip hop, double drumming, classic canadian rock, progressive groove metal, post-grunge, melancholia, progressive metal, alternative rock, grunge, palm desert scene, metal, canadian metal, jazz metal, thrash metal, instrumental djent, alternative metal, blues 
rock, electric bass, progressive death metal, jazz fusion, dance pop, art rock Cluster 2 (length: 74): jam band, funk metal, stoner rock, garage rock, french metal, instrumental math rock, mellow gold, album rock, zolo, microtonal, glam rock, french death metal, groove metal, jazz funk, melodic thrash, classic rock, old school thrash, uk post-punk, new wave, modern rock, instrumental rock, swedish progressive metal, post-rock, contemporary jazz, indie rock, stoner metal, rap metal, rock, hard rock, jazz, acid rock, djent, instrumental funk, el paso indie, funk, jazz rock, australian psych, math rock, soft rock, oxford indie, progressive rock, sacramento indie, nu metal, symphonic rock, swedish metal, shoegaze, trip hop, psychedelic rock, funk rock, permanent wave, speed metal, progressive jazz fusion, neo-psychedelic, double drumming, progressive groove metal, post-grunge, melancholia, progressive metal, alternative rock, grunge, palm desert scene, metal, p funk, jazz metal, thrash metal, instrumental djent, alternative metal, electric bass, blues rock, progressive death metal, jazz fusion, dance pop, noise pop, art rock Cluster 3 (length: 65): stoner rock, funk metal, conscious hip hop, garage rock, singer-songwriter, glam rock, album rock, mellow gold, microtonal, instrumental math rock, groove metal, jazz funk, melodic thrash, classic rock, old school thrash, modern rock, instrumental rock, contemporary jazz, stoner metal, rap metal, rock, hard rock, jazz, djent, instrumental funk, el paso indie, funk, australian psych, math rock, supergroup, soft rock, oxford indie, progressive rock, sacramento indie, nu metal, symphonic rock, rap rock, shoegaze, trip hop, psychedelic rock, funk rock, permanent wave, speed metal, progressive jazz fusion, neo-psychedelic, political hip hop, double drumming, classic canadian rock, post-grunge, melancholia, progressive metal, alternative rock, grunge, palm desert scene, p funk, metal, canadian metal, thrash metal, alternative 
metal, electric bass, blues rock, jazz fusion, dance pop, noise pop, art rock Cluster 4 (length: 72): jam band, funk metal, stoner rock, garage rock, french metal, instrumental math rock, mellow gold, album rock, zolo, microtonal, conscious hip hop, french death metal, groove metal, p funk, melodic thrash, classic rock, old school thrash, modern rock, instrumental rock, swedish progressive metal, post-rock, indie rock, technical thrash, stoner metal, rap metal, rock, hard rock, djent, acid rock, el paso indie, funk, jazz rock, australian psych, math rock, supergroup, soft rock, oxford indie, progressive rock, nu metal, symphonic rock, swedish metal, rap rock, trip hop, psychedelic rock, funk rock, permanent wave, speed metal, progressive jazz fusion, classic canadian rock, neo-psychedelic, double drumming, political hip hop, progressive groove metal, post-grunge, melancholia, progressive metal, alternative rock, canadian metal, grunge, metal, palm desert scene, jazz metal, thrash metal, instrumental djent, alternative metal, blues rock, progressive death metal, jazz fusion, technical groove metal, dance pop, noise pop, art rock Cluster 5 (length: 30): french metal, instrumental math rock, mellow gold, glam rock, album rock, french death metal, groove metal, classic rock, modern rock, instrumental rock, technical thrash, rap metal, rock, djent, soft rock, progressive rock, sacramento indie, nu metal, symphonic rock, swedish metal, shoegaze, permanent wave, progressive jazz fusion, progressive groove metal, progressive metal, alternative rock, metal, alternative metal, technical groove metal, art rock Cluster 6 (length: 67): jam band, stoner rock, funk metal, garage rock, french metal, conscious hip hop, zolo, album rock, singer-songwriter, microtonal, french death metal, groove metal, jazz funk, melodic thrash, classic rock, old school thrash, uk post-punk, new wave, modern rock, instrumental rock, swedish progressive metal, contemporary jazz, indie rock, rap metal, 
rock, hard rock, jazz, djent, acid rock, el paso indie, jazz rock, australian psych, math rock, supergroup, soft rock, oxford indie, progressive rock, sacramento indie, nu metal, symphonic rock, swedish metal, rap rock, psychedelic rock, trip hop, funk rock, permanent wave, speed metal, progressive jazz fusion, classic canadian rock, political hip hop, double drumming, neo-psychedelic, progressive groove metal, post-grunge, melancholia, progressive metal, alternative rock, canadian metal, grunge, metal, thrash metal, alternative metal, electric bass, progressive death metal, jazz fusion, dance pop, art rock Cluster 7 (length: 2): garage rock, el paso indie Cluster 8 (length: 63): funk metal, stoner rock, singer-songwriter, french metal, instrumental math rock, zolo, album rock, mellow gold, microtonal, french death metal, groove metal, jazz funk, classic rock, modern rock, instrumental rock, post-rock, contemporary jazz, stoner metal, technical thrash, rap metal, rock, hard rock, jazz, djent, instrumental funk, funk, jazz rock, australian psych, supergroup, soft rock, oxford indie, sacramento indie, progressive rock, nu metal, symphonic rock, swedish metal, psychedelic rock, trip hop, funk rock, permanent wave, progressive jazz fusion, classic canadian rock, neo-psychedelic, double drumming, progressive groove metal, post-grunge, melancholia, progressive metal, alternative rock, p funk, grunge, palm desert scene, metal, jazz metal, canadian metal, instrumental djent, alternative metal, electric bass, blues rock, jazz fusion, technical groove metal, dance pop, art rock Cluster 9 (length: 63): jam band, stoner rock, funk metal, singer-songwriter, french metal, garage rock, mellow gold, album rock, glam rock, microtonal, instrumental math rock, french death metal, groove metal, jazz funk, classic rock, modern rock, instrumental rock, post-rock, contemporary jazz, stoner metal, indie rock, technical thrash, rap metal, rock, hard rock, jazz, djent, acid rock, 
instrumental funk, el paso indie, funk, math rock, soft rock, oxford indie, sacramento indie, progressive rock, nu metal, symphonic rock, swedish metal, shoegaze, psychedelic rock, funk rock, permanent wave, progressive jazz fusion, classic canadian rock, progressive groove metal, post-grunge, melancholia, progressive metal, alternative rock, grunge, palm desert scene, p funk, metal, canadian metal, alternative metal, electric bass, blues rock, jazz fusion, technical groove metal, dance pop, noise pop, art rock
# Show up to five randomly drawn tracks for each GMM component, based on the
# single cluster assigned in "cluster_single".
preview_columns = ["track_name", "artist", "track_url"]
for i in range(gmm_components):
    in_cluster = track_clustered["cluster_single"] == i
    _df = track_clustered.loc[in_cluster, preview_columns]
    # Never ask for more rows than the cluster holds.
    elements = min(len(_df), 5)
    print(f"Cluster {i}")
    print(_df.sample(n=elements).values)
Cluster 0 [['This Day We Fight!' 'Megadeth' 'https://open.spotify.com/track/4NVzKq9K1Ryq9PSE9YyFuS'] ['Kalamazoo' 'Primus' 'https://open.spotify.com/track/0n3qQAR9kYNLDHV06vO3dD'] ['Echoplex' 'Nine Inch Nails' 'https://open.spotify.com/track/7EFBGCEsHR1MzZZMayDtXS'] ['Repent Walpurgis - 2009 Remaster Mono' 'Procol Harum' 'https://open.spotify.com/track/0uZNev6gpoxARwDLZ39wXq'] ['Ænema' 'TOOL' 'https://open.spotify.com/track/0nLOl4fSiBZKGFla5pLUtf']] Cluster 1 [['Beastly (Live at Madison Square Garden)' 'Vulfpeck' 'https://open.spotify.com/track/51IZyG8h72MiYJgEv7PWTs'] ['Calm Like a Bomb' 'Rage Against The Machine' 'https://open.spotify.com/track/1hR0fIFK2qRG3f3RF70pb7'] ['21st Century Schizoid Man' 'King Crimson' 'https://open.spotify.com/track/5yClziwiwTdqRmdPQl3NDz'] ['New Found' 'Gojira' 'https://open.spotify.com/track/3Kl4f3sVKb8iwQ8ZffHim6'] ['The Coma Machine' 'Between The Buried And Me' 'https://open.spotify.com/track/5EM6ySpK1SYeof4jm72psM']] Cluster 2 [['Cotopaxi' 'The Mars Volta' 'https://open.spotify.com/track/54CBNvVv91AkRUopvZUi2z'] ['Incinerate' 'Sonic Youth' 'https://open.spotify.com/track/6bGnkppjxJJY4HeOQ5dheD'] ['Supertwister' 'Camel' 'https://open.spotify.com/track/0FZvr22qlD6sEJ0RDJWmvE'] ["Feet Don't Fail Me" 'Queens of the Stone Age' 'https://open.spotify.com/track/2JB7PvDV0R3Vwbq0iy1WPe'] ['Over The Electric Grapevine' 'Primus' 'https://open.spotify.com/track/42mybYyPzY5S6btIgjMmD6']] Cluster 3 [['Peace Sells - 2004 Remaster' 'Megadeth' 'https://open.spotify.com/track/3090goAxG6IlpCifA8m9xB'] ['Culling Voices' 'TOOL' 'https://open.spotify.com/track/3gPxMQWDMSEyPXMtzbcDdQ'] ['If I Had a Tail' 'Queens of the Stone Age' 'https://open.spotify.com/track/06hBdrgjUendZyH9U1WV22'] ['Dean Town' 'Vulfpeck' 'https://open.spotify.com/track/1oOD1pV43cV9sHg97aBdLs'] ['The Glass Prison' 'Dream Theater' 'https://open.spotify.com/track/4IuVWyD0fgFvX6V5CL0mHC']] Cluster 4 [['Pigs (Three Different Ones) - 2011 Remaster' 'Pink Floyd' 
'https://open.spotify.com/track/23dTQ4fjLrPPbYHamkJDzo'] ['The Fountain Of Salmacis - Digital Remastered 2008' 'Genesis' 'https://open.spotify.com/track/28ZKFPssg1TAgVVw8GKDRJ'] ['Creeping Death - Remastered' 'Metallica' 'https://open.spotify.com/track/4CLRLdxUDPNeO1xCW0U2zC'] ['Metatron' 'The Mars Volta' 'https://open.spotify.com/track/2MM7824ht8LrT5oa00bgt8'] ['First It Giveth' 'Queens of the Stone Age' 'https://open.spotify.com/track/0UEMTz9APWfoJHdlXDeIzm']] Cluster 5 [['Minerva' 'Deftones' 'https://open.spotify.com/track/1gzWd0ILFaCoHUfQSkCIvl'] ['Kodama' 'Alcest' 'https://open.spotify.com/track/300bSfVhnNrXHZEHKxQjPU'] ['Landmines' 'Rishloo' 'https://open.spotify.com/track/0nMKVMCXCmRplsKKLt1TTh'] ["Fool's Overture" 'Supertramp' 'https://open.spotify.com/track/5pSSEkT0963muzzIjsVkrs'] ['Selenium Forest' 'Plini' 'https://open.spotify.com/track/18pIsa1XH5Eap4SBcSH4Xd']] Cluster 6 [['Cherub Rock - 2011 Remaster' 'The Smashing Pumpkins' 'https://open.spotify.com/track/3ow0TQVttXQF8rLckmXgRx'] ['Of Mind - Exile' 'TesseracT' 'https://open.spotify.com/track/23G48RKz9osJ0JINJ9fc8Y'] ['Lovesong - Remastered' 'The Cure' 'https://open.spotify.com/track/2mIrfke7vosXAEWfz6ucyo'] ['Shrinking Universe' 'Muse' 'https://open.spotify.com/track/4lUWIcM7hNaDeoIiD0NJSS'] ['Blood Sugar Sex Magik' 'Red Hot Chili Peppers' 'https://open.spotify.com/track/4ddA4oHvhrGx196qF5ZyQe']] Cluster 7 [['Dyslexicon' 'The Mars Volta' 'https://open.spotify.com/track/07DiJLA3Qef8OT23D4qlWC']] Cluster 8 [['Malignant Narcissism (De Slagwerker) - Snakes & Arrows Live Version' 'Rush' 'https://open.spotify.com/track/0RgoYqO4DqiLakL7gEX8Pl'] ['Karn Evil 9 1st Impression, Pt. 2 - 2014 Remaster' 'Emerson, Lake & Palmer' 'https://open.spotify.com/track/0nDQu5i6B93GvUJH8iJ0y9'] ['Tempting Time' 'Animals As Leaders' 'https://open.spotify.com/track/3TwKLIOl2hIPDE7z7rvOH3'] ['What Did He Say?' 
'Victor Wooten' 'https://open.spotify.com/track/5VJBQgRaE0M8PwZJQVaQzY'] ['Blew' 'Nirvana' 'https://open.spotify.com/track/7pETV41GUutaZ6KMHMAYIH']] Cluster 9 [["You Know You're Right" 'Nirvana' 'https://open.spotify.com/track/5O8L4I2S4izFGDjvP7xAKv'] ['The Gift of Guilt' 'Gojira' 'https://open.spotify.com/track/5ke0lqEJZM2nAD3aRMxfV2'] ['Juno' 'TesseracT' 'https://open.spotify.com/track/1xj9Mdr9jtBpRA6WqqMYbn'] ['Talk Show Host' 'Radiohead' 'https://open.spotify.com/track/3cMuGOGSaTWbwOurTS4b3Y'] ['The Raven That Refused to Sing' 'Steven Wilson' 'https://open.spotify.com/track/69WzNkdbqaJEK6XanOkB4P']]
# Summarise which genres end up in each cluster.
# First expand the list-valued "genres" column to one row per (track, genre).
genre_info_clustered = track_clustered.explode("genres")
# Rows whose genre is missing after the explode get an explicit label so they
# are still counted (and can be joined as strings below).
genre_info_clustered["genres"] = genre_info_clustered["genres"].fillna("No Genre")
# Then expand the list-valued "clusters" column to one row per
# (track, genre, cluster).
genre_info_clustered = genre_info_clustered.explode("clusters")

cluster_items = (
    genre_info_clustered[["clusters", "genres"]]
    .groupby("clusters")
    .agg(list)
    .reset_index()
)
# Keep each genre once per cluster. Sorting makes the printed order stable:
# iterating a plain set of strings depends on the interpreter's hash seed, so
# the unsorted output changes from run to run.
cluster_items["genres"] = cluster_items["genres"].apply(
    lambda genres: sorted(set(genres))
)

for cluster, genres in zip(cluster_items["clusters"], cluster_items["genres"]):
    print(f"Cluster {cluster} (length: {len(genres)}): {', '.join(genres)}\n")
Cluster 0 (length: 70): jam band, funk metal, stoner rock, singer-songwriter, french metal, garage rock, glam rock, album rock, mellow gold, microtonal, french death metal, groove metal, jazz funk, melodic thrash, classic rock, uk post-punk, old school thrash, new wave, modern rock, instrumental rock, swedish progressive metal, contemporary jazz, stoner metal, technical thrash, indie rock, rap metal, rock, hard rock, jazz, acid rock, djent, instrumental funk, el paso indie, australian psych, math rock, supergroup, soft rock, oxford indie, progressive rock, sacramento indie, nu metal, symphonic rock, swedish metal, shoegaze, psychedelic rock, funk rock, permanent wave, speed metal, progressive jazz fusion, classic canadian rock, neo-psychedelic, double drumming, progressive groove metal, post-grunge, melancholia, progressive metal, alternative rock, grunge, palm desert scene, metal, canadian metal, thrash metal, alternative metal, blues rock, electric bass, progressive death metal, jazz fusion, technical groove metal, dance pop, art rock Cluster 1 (length: 69): jam band, funk metal, stoner rock, conscious hip hop, french metal, garage rock, zolo, album rock, microtonal, french death metal, groove metal, p funk, melodic thrash, classic rock, old school thrash, jazz funk, modern rock, instrumental rock, swedish progressive metal, contemporary jazz, stoner metal, indie rock, rap metal, rock, hard rock, jazz, djent, instrumental funk, el paso indie, funk, jazz rock, australian psych, supergroup, soft rock, oxford indie, progressive rock, sacramento indie, nu metal, symphonic rock, swedish metal, rap rock, psychedelic rock, funk rock, permanent wave, speed metal, progressive jazz fusion, neo-psychedelic, political hip hop, double drumming, classic canadian rock, progressive groove metal, post-grunge, melancholia, progressive metal, alternative rock, grunge, palm desert scene, metal, canadian metal, jazz metal, thrash metal, instrumental djent, alternative metal, blues 
rock, electric bass, progressive death metal, jazz fusion, dance pop, art rock Cluster 2 (length: 74): jam band, funk metal, stoner rock, garage rock, french metal, instrumental math rock, mellow gold, album rock, zolo, microtonal, glam rock, french death metal, groove metal, jazz funk, melodic thrash, classic rock, old school thrash, uk post-punk, new wave, modern rock, instrumental rock, swedish progressive metal, post-rock, contemporary jazz, indie rock, stoner metal, rap metal, rock, hard rock, jazz, acid rock, djent, instrumental funk, el paso indie, funk, jazz rock, australian psych, math rock, soft rock, oxford indie, progressive rock, sacramento indie, nu metal, symphonic rock, swedish metal, shoegaze, trip hop, psychedelic rock, funk rock, permanent wave, speed metal, progressive jazz fusion, neo-psychedelic, double drumming, progressive groove metal, post-grunge, melancholia, progressive metal, alternative rock, grunge, palm desert scene, metal, p funk, jazz metal, thrash metal, instrumental djent, alternative metal, electric bass, blues rock, progressive death metal, jazz fusion, dance pop, noise pop, art rock Cluster 3 (length: 65): stoner rock, funk metal, conscious hip hop, garage rock, singer-songwriter, glam rock, album rock, mellow gold, microtonal, instrumental math rock, groove metal, jazz funk, melodic thrash, classic rock, old school thrash, modern rock, instrumental rock, contemporary jazz, stoner metal, rap metal, rock, hard rock, jazz, djent, instrumental funk, el paso indie, funk, australian psych, math rock, supergroup, soft rock, oxford indie, progressive rock, sacramento indie, nu metal, symphonic rock, rap rock, shoegaze, trip hop, psychedelic rock, funk rock, permanent wave, speed metal, progressive jazz fusion, neo-psychedelic, political hip hop, double drumming, classic canadian rock, post-grunge, melancholia, progressive metal, alternative rock, grunge, palm desert scene, p funk, metal, canadian metal, thrash metal, alternative 
metal, electric bass, blues rock, jazz fusion, dance pop, noise pop, art rock Cluster 4 (length: 72): jam band, funk metal, stoner rock, garage rock, french metal, instrumental math rock, mellow gold, album rock, zolo, microtonal, conscious hip hop, french death metal, groove metal, p funk, melodic thrash, classic rock, old school thrash, modern rock, instrumental rock, swedish progressive metal, post-rock, indie rock, technical thrash, stoner metal, rap metal, rock, hard rock, djent, acid rock, el paso indie, funk, jazz rock, australian psych, math rock, supergroup, soft rock, oxford indie, progressive rock, nu metal, symphonic rock, swedish metal, rap rock, trip hop, psychedelic rock, funk rock, permanent wave, speed metal, progressive jazz fusion, classic canadian rock, neo-psychedelic, double drumming, political hip hop, progressive groove metal, post-grunge, melancholia, progressive metal, alternative rock, canadian metal, grunge, metal, palm desert scene, jazz metal, thrash metal, instrumental djent, alternative metal, blues rock, progressive death metal, jazz fusion, technical groove metal, dance pop, noise pop, art rock Cluster 5 (length: 30): french metal, instrumental math rock, mellow gold, glam rock, album rock, french death metal, groove metal, classic rock, modern rock, instrumental rock, technical thrash, rap metal, rock, djent, soft rock, progressive rock, sacramento indie, nu metal, symphonic rock, swedish metal, shoegaze, permanent wave, progressive jazz fusion, progressive groove metal, progressive metal, alternative rock, metal, alternative metal, technical groove metal, art rock Cluster 6 (length: 67): jam band, stoner rock, funk metal, garage rock, french metal, conscious hip hop, zolo, album rock, singer-songwriter, microtonal, french death metal, groove metal, jazz funk, melodic thrash, classic rock, old school thrash, uk post-punk, new wave, modern rock, instrumental rock, swedish progressive metal, contemporary jazz, indie rock, rap metal, 
rock, hard rock, jazz, djent, acid rock, el paso indie, jazz rock, australian psych, math rock, supergroup, soft rock, oxford indie, progressive rock, sacramento indie, nu metal, symphonic rock, swedish metal, rap rock, psychedelic rock, trip hop, funk rock, permanent wave, speed metal, progressive jazz fusion, classic canadian rock, political hip hop, double drumming, neo-psychedelic, progressive groove metal, post-grunge, melancholia, progressive metal, alternative rock, canadian metal, grunge, metal, thrash metal, alternative metal, electric bass, progressive death metal, jazz fusion, dance pop, art rock Cluster 7 (length: 2): garage rock, el paso indie Cluster 8 (length: 63): funk metal, stoner rock, singer-songwriter, french metal, instrumental math rock, zolo, album rock, mellow gold, microtonal, french death metal, groove metal, jazz funk, classic rock, modern rock, instrumental rock, post-rock, contemporary jazz, stoner metal, technical thrash, rap metal, rock, hard rock, jazz, djent, instrumental funk, funk, jazz rock, australian psych, supergroup, soft rock, oxford indie, sacramento indie, progressive rock, nu metal, symphonic rock, swedish metal, psychedelic rock, trip hop, funk rock, permanent wave, progressive jazz fusion, classic canadian rock, neo-psychedelic, double drumming, progressive groove metal, post-grunge, melancholia, progressive metal, alternative rock, p funk, grunge, palm desert scene, metal, jazz metal, canadian metal, instrumental djent, alternative metal, electric bass, blues rock, jazz fusion, technical groove metal, dance pop, art rock Cluster 9 (length: 63): jam band, stoner rock, funk metal, singer-songwriter, french metal, garage rock, mellow gold, album rock, glam rock, microtonal, instrumental math rock, french death metal, groove metal, jazz funk, classic rock, modern rock, instrumental rock, post-rock, contemporary jazz, stoner metal, indie rock, technical thrash, rap metal, rock, hard rock, jazz, djent, acid rock, 
instrumental funk, el paso indie, funk, math rock, soft rock, oxford indie, sacramento indie, progressive rock, nu metal, symphonic rock, swedish metal, shoegaze, psychedelic rock, funk rock, permanent wave, progressive jazz fusion, classic canadian rock, progressive groove metal, post-grunge, melancholia, progressive metal, alternative rock, grunge, palm desert scene, p funk, metal, canadian metal, alternative metal, electric bass, blues rock, jazz fusion, technical groove metal, dance pop, noise pop, art rock
# Print a random sample of at most five tracks for each of the
# `gmm_components` clusters, as a quick sanity check of the assignment.
df_exploded = track_clustered.explode("clusters")
shown_columns = ["track_name", "artist", "track_url"]

for cluster_id in range(gmm_components):
    in_cluster = df_exploded["clusters"] == cluster_id
    cluster_tracks = df_exploded.loc[in_cluster, shown_columns]
    # A cluster may hold fewer than five tracks; never ask for more than exist.
    sample_size = min(5, len(cluster_tracks))
    print(f"Cluster {cluster_id}")
    print(cluster_tracks.sample(sample_size).values)
Cluster 0 [['The Daily Mail' 'Radiohead' 'https://open.spotify.com/track/0Kwg1epDRFLyXFq09t7MtB'] ['Magma' 'King Gizzard & The Lizard Wizard' 'https://open.spotify.com/track/5nXO28PQrL5unzmFUJrdlT'] ['Stinkfist' 'TOOL' 'https://open.spotify.com/track/0pwObEOHolQZSldJ2q1wpy'] ['Clockworks' 'Meshuggah' 'https://open.spotify.com/track/5Q5JuwRUZQVNXTCoAkjgQf'] ['Dani California' 'Red Hot Chili Peppers' 'https://open.spotify.com/track/10Nmj3JCNoMeBQ87uw5j8k']] Cluster 1 [['Down Rodeo' 'Rage Against The Machine' 'https://open.spotify.com/track/40L8L7344pPMCjlZc19RQU'] ['Third Eye' 'TOOL' 'https://open.spotify.com/track/2Dqg2mRbfIVKhBZleNrgmH'] ['Right In Two' 'TOOL' 'https://open.spotify.com/track/0NLDZzVke3Qu7vDhWyGzRk'] ['7empest' 'TOOL' 'https://open.spotify.com/track/0gGfmw4csswZmFPj9YK8GW'] ['Herd Culling' 'Porcupine Tree' 'https://open.spotify.com/track/1OBPAhkQe8GvyPGVFiNuEI']] Cluster 2 [['Absolomb' 'Periphery' 'https://open.spotify.com/track/3cWMNY5QEgeFl7WOfn4gf3'] ['Just Another Story' 'Jamiroquai' 'https://open.spotify.com/track/1DLweHBYVlRDMYTGaFtJFR'] ["(Don't) Give Hate a Chance" 'Jamiroquai' 'https://open.spotify.com/track/2IJk5fQC6Qiu2gRrCR2977'] ['Fear of a Blank Planet - Live' 'Porcupine Tree' 'https://open.spotify.com/track/5JS752i84FtILtqjYxJ9qK'] ['Nature Boy' 'Primus' 'https://open.spotify.com/track/45z51HNfBAhKZ8D8tYBnxZ']] Cluster 3 [['Snow (Hey Oh)' 'Red Hot Chili Peppers' 'https://open.spotify.com/track/2aibwv5hGXSgw7Yru8IYTO'] ['Love Foolosophy - Radio Edit' 'Jamiroquai' 'https://open.spotify.com/track/0upgxxew2mVAEctrz08jnf'] ['Halo' 'Porcupine Tree' 'https://open.spotify.com/track/7lqZVOSl9uDx8l2TwxkLnV'] ['sea dragon (feat. 
Mario Camarena)' 'Covet' 'https://open.spotify.com/track/2bAsW3ElSqMVmmilHwSuRy'] ['Nuclear Fusion' 'King Gizzard & The Lizard Wizard' 'https://open.spotify.com/track/5iGPF5Vg43aZ2QTXslY6VA']] Cluster 4 [['Falling Away from Me' 'Korn' 'https://open.spotify.com/track/2F6FfZ4w8z3eJpSxPotVO5'] ["You're Lost Little Girl" 'The Doors' 'https://open.spotify.com/track/5onlaW8X1ps8VS4DhxpFom'] ['Cassandra Gemini: Con Safo' 'The Mars Volta' 'https://open.spotify.com/track/2M9csdSUksnG5P1D9U50mS'] ['One Armed Scissor' 'At the Drive-In' 'https://open.spotify.com/track/3T1YjH72qs6L6tDscZFZL5'] ['Megalomania' 'Muse' 'https://open.spotify.com/track/2S9tY6X04CTb9ZAA2PCpC2']] Cluster 5 [['Kodama' 'Alcest' 'https://open.spotify.com/track/300bSfVhnNrXHZEHKxQjPU'] ['Acceptance - Concealing Fate, Pt. 1' 'TesseracT' 'https://open.spotify.com/track/6zqrAN3xSWrTMTUE4vjgZB'] ['Only Pain' 'Gojira' 'https://open.spotify.com/track/7MuPOXU2vjEe3Kh9LfbM5v'] ['United States of Eurasia (+Collateral Damage)' 'Muse' 'https://open.spotify.com/track/0tHbQRjL5phd8OoYl2Bdnd'] ['Thoughts of a Dying Atheist' 'Muse' 'https://open.spotify.com/track/7LB6xhGZ0jCbP3PfUDA7yw']] Cluster 6 [['Saucy' 'Polyphia' 'https://open.spotify.com/track/6VcimclPSNITdEnc6gZGNQ'] ['Hysteria' 'Muse' 'https://open.spotify.com/track/7xyYsOvq5Ec3P4fr6mM9fD'] ['Break on Through (To the Other Side)' 'The Doors' 'https://open.spotify.com/track/6ToM0uwxtPKo9CMpbPGYvM'] ['Cherub Rock - 2011 Remaster' 'The Smashing Pumpkins' 'https://open.spotify.com/track/3ow0TQVttXQF8rLckmXgRx'] ['Polyphonic Rust' 'Igorrr' 'https://open.spotify.com/track/3897Y6rgHaKjklzhgctuqu']] Cluster 7 [['Dyslexicon' 'The Mars Volta' 'https://open.spotify.com/track/07DiJLA3Qef8OT23D4qlWC']] Cluster 8 [['Of Reality - Eclipse' 'TesseracT' 'https://open.spotify.com/track/5W7oPYXq0iLn6KsAuMF6Ed'] ['I Am a Husk' 'The Mercury Tree' 'https://open.spotify.com/track/0NsQVpzw8lqoVYmSqVeAhu'] ['Soraya' 'Animals As Leaders' 
'https://open.spotify.com/track/0QZ3iZwNJHS9SXXyiJsU30'] ['Malignant Narcissism (De Slagwerker) - Snakes & Arrows Live Version' 'Rush' 'https://open.spotify.com/track/0RgoYqO4DqiLakL7gEX8Pl'] ['Layla' 'Derek & The Dominos' 'https://open.spotify.com/track/2kkvB3RNRzwjFdGhaUA0tz']] Cluster 9 [['Heavy Metal Machine' 'The Smashing Pumpkins' 'https://open.spotify.com/track/2V8xAzhhqPIiN2zvzyZf58'] ['Xanadu' 'Rush' 'https://open.spotify.com/track/5q7KiSN7znIONR76LAqm7k'] ['The Gallery' 'Muse' 'https://open.spotify.com/track/2b7lgdr7kkAnsyAJ1Tdp2V'] ['Portal' 'Snarky Puppy' 'https://open.spotify.com/track/4yjERvT86l7HgVQjj3j3BP'] ['Ruled by Secrecy' 'Muse' 'https://open.spotify.com/track/5Zm5oxx8yyLKOAJf1knPmR']]
The results are similar to those of KMeans: the clusters are very large (some contain more than half of the classes) and seem very mixed, with a little bit of everything in almost all of them. It's not a good classification.
We have seen that clustering can be very challenging when there is a large number of classes. We know a priori that some genres are related (rock and its subgenres are all "rock" music), but trying to automate the grouping without manually assigning labels to each genre has been somewhat problematic. This shows the difficulty of building an ML algorithm with a subjective target (we know that, for example, indie and alternative music can have similarities, and that they are usually very different from, say, extreme metal or classical music) and with so much variety in the classes.
When the dataset is reduced to include only the most represented genres, the results of the semantic clustering are reasonably good, with groups of moderate size and sensible aggregations. Also, since this is a more subjective classification, the semantic transformer may capture some subjective relationships between the words used in the genre names, which helps the clustering. This approach also shows that, in some cases, it's better (both faster and with better results) to use a pre-trained model that fits the task than to "reinvent the wheel" by trying to construct a complex model.
When using features to cluster, the results are way off, for three main reasons. First, a song can have several genres, and not all of them may apply to the song, since the genres are artist-level information. Second, there is no quantification of "how much" a song belongs to a certain genre, so this can't be simplified in an easy way. Finally, it is debatable whether a song or artist really is of a certain genre according to the Spotify classification.